valediction-1.0.0-py3-none-any.whl → valediction-1.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- valediction/__init__.py +8 -8
- valediction/convenience.py +45 -50
- valediction/data_types/data_type_helpers.py +75 -75
- valediction/data_types/data_types.py +58 -58
- valediction/data_types/type_inference.py +541 -541
- valediction/datasets/datasets.py +870 -870
- valediction/datasets/datasets_helpers.py +46 -46
- valediction/demo/DEMOGRAPHICS.csv +101 -101
- valediction/demo/DIAGNOSES.csv +650 -650
- valediction/demo/LAB_TESTS.csv +1001 -1001
- valediction/demo/VITALS.csv +1001 -1001
- valediction/demo/__init__.py +6 -6
- valediction/demo/demo_dictionary.py +129 -129
- valediction/dictionary/exporting.py +501 -501
- valediction/dictionary/exporting_helpers.py +371 -371
- valediction/dictionary/generation.py +357 -357
- valediction/dictionary/helpers.py +174 -174
- valediction/dictionary/importing.py +494 -494
- valediction/dictionary/integrity.py +37 -37
- valediction/dictionary/model.py +582 -582
- valediction/exceptions.py +22 -22
- valediction/integrity.py +97 -97
- valediction/io/csv_readers.py +307 -307
- valediction/progress.py +206 -206
- valediction/support.py +72 -72
- valediction/validation/helpers.py +315 -315
- valediction/validation/issues.py +280 -280
- valediction/validation/validation.py +598 -598
- {valediction-1.0.0.dist-info → valediction-1.1.0.dist-info}/METADATA +1 -1
- valediction-1.1.0.dist-info/RECORD +38 -0
- {valediction-1.0.0.dist-info → valediction-1.1.0.dist-info}/WHEEL +1 -1
- valediction-1.0.0.dist-info/RECORD +0 -38
|
@@ -1,174 +1,174 @@
|
|
|
1
|
-
import re
|
|
2
|
-
from typing import Any, Literal
|
|
3
|
-
|
|
4
|
-
from pandas import Series
|
|
5
|
-
from pandas import isna as _pd_isna
|
|
6
|
-
|
|
7
|
-
from valediction.data_types.data_types import DataType
|
|
8
|
-
from valediction.exceptions import DataDictionaryImportError
|
|
9
|
-
from valediction.integrity import get_config
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def _check_name(name: str, entity: Literal["table", "column"]) -> list[str]:
|
|
13
|
-
if entity not in ["table", "column"]:
|
|
14
|
-
raise ValueError("entity must be either 'table' or 'column'")
|
|
15
|
-
|
|
16
|
-
errors: list = []
|
|
17
|
-
config = get_config()
|
|
18
|
-
invalid_chars = (
|
|
19
|
-
config.invalid_name_pattern
|
|
20
|
-
if isinstance(config.invalid_name_pattern, re.Pattern)
|
|
21
|
-
else re.compile(config.invalid_name_pattern)
|
|
22
|
-
)
|
|
23
|
-
max_name_length = (
|
|
24
|
-
config.max_table_name_length
|
|
25
|
-
if entity == "table"
|
|
26
|
-
else config.max_column_name_length
|
|
27
|
-
)
|
|
28
|
-
|
|
29
|
-
if name != name.upper(): # name must be uppercase
|
|
30
|
-
errors.append("must be uppercase")
|
|
31
|
-
|
|
32
|
-
if invalid_chars.search(name): # check invalid characters
|
|
33
|
-
bad = set(invalid_chars.findall(name))
|
|
34
|
-
errors.append(
|
|
35
|
-
f"invalid characters: '{''.join(sorted(bad))}'; "
|
|
36
|
-
"only A-Z, 0-9, and underscores are allowed with no whitespace"
|
|
37
|
-
)
|
|
38
|
-
|
|
39
|
-
if len(name) > max_name_length: # max length 30
|
|
40
|
-
errors.append(f"exceeds max length of {max_name_length}")
|
|
41
|
-
|
|
42
|
-
if not name[0].isalpha(): # column starts with a letter
|
|
43
|
-
errors.append("must start with a letter")
|
|
44
|
-
|
|
45
|
-
if name.endswith("_"): # column cannot end with an underscore
|
|
46
|
-
errors.append("cannot end with '_'")
|
|
47
|
-
|
|
48
|
-
if "__" in name: # column cannot contain double underscores
|
|
49
|
-
errors.append("cannot contain double underscores '__'")
|
|
50
|
-
|
|
51
|
-
return errors
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
def _check_order(order: int | None) -> list[str]:
|
|
55
|
-
errors: list = []
|
|
56
|
-
if order is None: # presence
|
|
57
|
-
errors.append("order is required and must be an integer ≥ 1")
|
|
58
|
-
return errors
|
|
59
|
-
|
|
60
|
-
if not isinstance(order, int): # type integer
|
|
61
|
-
errors.append("order must be an integer ≥ 1")
|
|
62
|
-
return errors
|
|
63
|
-
|
|
64
|
-
if order < 1: # must be ≥ 1
|
|
65
|
-
errors.append("order must be ≥ 1")
|
|
66
|
-
return errors
|
|
67
|
-
|
|
68
|
-
return errors
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
def _check_data_type(data_type: DataType, length: int | None) -> list[str]:
|
|
72
|
-
errors: list = []
|
|
73
|
-
if not isinstance(data_type, DataType): # Ensure is a DataType
|
|
74
|
-
errors.append("data type is invalid; must be a DataType object")
|
|
75
|
-
|
|
76
|
-
if length is not None: # length rules
|
|
77
|
-
if not isinstance(length, int):
|
|
78
|
-
errors.append("length must be an positive integer if provided")
|
|
79
|
-
if length <= 0: # must be positive
|
|
80
|
-
errors.append("length must be an positive integer if provided")
|
|
81
|
-
|
|
82
|
-
if data_type == DataType.TEXT: # required for DataType.TEXT
|
|
83
|
-
if length is None:
|
|
84
|
-
errors.append("length is required for TEXT columns")
|
|
85
|
-
else:
|
|
86
|
-
if length is not None: # length not applicable
|
|
87
|
-
errors.append(f"length is not applicable to {data_type.value} columns")
|
|
88
|
-
|
|
89
|
-
return errors
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
def _check_primary_key(primary_key: int | None, data_type: DataType) -> list[str]:
|
|
93
|
-
errors: list = []
|
|
94
|
-
if primary_key is None:
|
|
95
|
-
return errors
|
|
96
|
-
|
|
97
|
-
if (
|
|
98
|
-
not isinstance(primary_key, int)
|
|
99
|
-
or primary_key < 1
|
|
100
|
-
or primary_key > get_config().max_primary_keys
|
|
101
|
-
):
|
|
102
|
-
errors.append(
|
|
103
|
-
"primary key order must be an integer between 1 and 7 if provided"
|
|
104
|
-
)
|
|
105
|
-
|
|
106
|
-
if (
|
|
107
|
-
hasattr(data_type, "valid_for_primary_key")
|
|
108
|
-
and not data_type.valid_for_primary_key()
|
|
109
|
-
):
|
|
110
|
-
errors.append(
|
|
111
|
-
f"invalid data type '{data_type.value}' for primary key column; "
|
|
112
|
-
"primary keys must be Text, Integer, Date, or Datetime"
|
|
113
|
-
)
|
|
114
|
-
|
|
115
|
-
return errors
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
def _normalise_name(name: str) -> str:
|
|
119
|
-
return name.upper().strip()
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
def _norm_header_map(columns: list) -> dict:
|
|
123
|
-
mapping, _ = {}, set()
|
|
124
|
-
for c in columns:
|
|
125
|
-
k = str(c).strip().lower().replace(" ", "_").replace("-", "_")
|
|
126
|
-
if k in mapping: # collision
|
|
127
|
-
raise DataDictionaryImportError(
|
|
128
|
-
f"Ambiguous headers after normalisation: {mapping[k]!r} and {c!r} both map to {k!r}"
|
|
129
|
-
)
|
|
130
|
-
mapping[k] = c
|
|
131
|
-
return mapping
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
def _get_required_header(header_map: dict[str, str], key: str) -> str:
|
|
135
|
-
if key not in header_map:
|
|
136
|
-
raise DataDictionaryImportError(
|
|
137
|
-
f"Required Data Dictionary column '{key}' not found. Available: {list(header_map.keys())}"
|
|
138
|
-
)
|
|
139
|
-
return header_map[key]
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
def _is_missing(val: Any) -> bool:
|
|
143
|
-
return _pd_isna(val) or (isinstance(val, str) and val.strip() == "")
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
def _parse_truthy(val: Any) -> bool:
|
|
147
|
-
if isinstance(val, str):
|
|
148
|
-
return val.strip().lower() in {"y", "yes", "true", "1"}
|
|
149
|
-
if isinstance(val, (int, float)):
|
|
150
|
-
try:
|
|
151
|
-
return int(val) == 1
|
|
152
|
-
except Exception:
|
|
153
|
-
return False
|
|
154
|
-
return False
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
def _row_is_blank(row: Series, keys: tuple[str, str]) -> bool:
|
|
158
|
-
a, b = keys
|
|
159
|
-
return _is_missing(row[a]) and _is_missing(row[b])
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
def _parse_int(
|
|
163
|
-
value: Any, label: str, row_ctx: str, *, required: bool = True
|
|
164
|
-
) -> int | None:
|
|
165
|
-
if _is_missing(value):
|
|
166
|
-
if required:
|
|
167
|
-
raise DataDictionaryImportError(f"{row_ctx}: {label} is required.")
|
|
168
|
-
return None
|
|
169
|
-
try:
|
|
170
|
-
return int(value)
|
|
171
|
-
except Exception as e:
|
|
172
|
-
raise DataDictionaryImportError(
|
|
173
|
-
f"{row_ctx}: {label} must be integer (got {value!r})."
|
|
174
|
-
) from e
|
|
1
|
+
import re
|
|
2
|
+
from typing import Any, Literal
|
|
3
|
+
|
|
4
|
+
from pandas import Series
|
|
5
|
+
from pandas import isna as _pd_isna
|
|
6
|
+
|
|
7
|
+
from valediction.data_types.data_types import DataType
|
|
8
|
+
from valediction.exceptions import DataDictionaryImportError
|
|
9
|
+
from valediction.integrity import get_config
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _check_name(name: str, entity: Literal["table", "column"]) -> list[str]:
|
|
13
|
+
if entity not in ["table", "column"]:
|
|
14
|
+
raise ValueError("entity must be either 'table' or 'column'")
|
|
15
|
+
|
|
16
|
+
errors: list = []
|
|
17
|
+
config = get_config()
|
|
18
|
+
invalid_chars = (
|
|
19
|
+
config.invalid_name_pattern
|
|
20
|
+
if isinstance(config.invalid_name_pattern, re.Pattern)
|
|
21
|
+
else re.compile(config.invalid_name_pattern)
|
|
22
|
+
)
|
|
23
|
+
max_name_length = (
|
|
24
|
+
config.max_table_name_length
|
|
25
|
+
if entity == "table"
|
|
26
|
+
else config.max_column_name_length
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
if name != name.upper(): # name must be uppercase
|
|
30
|
+
errors.append("must be uppercase")
|
|
31
|
+
|
|
32
|
+
if invalid_chars.search(name): # check invalid characters
|
|
33
|
+
bad = set(invalid_chars.findall(name))
|
|
34
|
+
errors.append(
|
|
35
|
+
f"invalid characters: '{''.join(sorted(bad))}'; "
|
|
36
|
+
"only A-Z, 0-9, and underscores are allowed with no whitespace"
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
if len(name) > max_name_length: # max length 30
|
|
40
|
+
errors.append(f"exceeds max length of {max_name_length}")
|
|
41
|
+
|
|
42
|
+
if not name[0].isalpha(): # column starts with a letter
|
|
43
|
+
errors.append("must start with a letter")
|
|
44
|
+
|
|
45
|
+
if name.endswith("_"): # column cannot end with an underscore
|
|
46
|
+
errors.append("cannot end with '_'")
|
|
47
|
+
|
|
48
|
+
if "__" in name: # column cannot contain double underscores
|
|
49
|
+
errors.append("cannot contain double underscores '__'")
|
|
50
|
+
|
|
51
|
+
return errors
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _check_order(order: int | None) -> list[str]:
|
|
55
|
+
errors: list = []
|
|
56
|
+
if order is None: # presence
|
|
57
|
+
errors.append("order is required and must be an integer ≥ 1")
|
|
58
|
+
return errors
|
|
59
|
+
|
|
60
|
+
if not isinstance(order, int): # type integer
|
|
61
|
+
errors.append("order must be an integer ≥ 1")
|
|
62
|
+
return errors
|
|
63
|
+
|
|
64
|
+
if order < 1: # must be ≥ 1
|
|
65
|
+
errors.append("order must be ≥ 1")
|
|
66
|
+
return errors
|
|
67
|
+
|
|
68
|
+
return errors
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _check_data_type(data_type: DataType, length: int | None) -> list[str]:
    """Validate a column's data type and optional length.

    Returns a list of error strings (empty when valid). Length is required
    for TEXT columns and not applicable to any other data type.
    """
    errors: list = []
    if not isinstance(data_type, DataType):  # ensure is a DataType
        errors.append("data type is invalid; must be a DataType object")

    if length is not None:  # length rules
        # elif: comparing a non-int (e.g. "abc") to 0 would raise TypeError
        if not isinstance(length, int):
            errors.append("length must be a positive integer if provided")
        elif length <= 0:  # must be positive
            errors.append("length must be a positive integer if provided")

    if data_type == DataType.TEXT:  # length required for DataType.TEXT
        if length is None:
            errors.append("length is required for TEXT columns")
    elif length is not None:  # length not applicable to other types
        errors.append(f"length is not applicable to {data_type.value} columns")

    return errors
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _check_primary_key(primary_key: int | None, data_type: DataType) -> list[str]:
    """Validate a primary-key position and the key column's data type.

    Returns a list of error strings; an empty list when *primary_key* is
    None (not a key column) or when all checks pass.
    """
    errors: list = []
    if primary_key is None:  # not part of the primary key: nothing to check
        return errors

    max_primary_keys = get_config().max_primary_keys
    if (
        not isinstance(primary_key, int)
        or primary_key < 1
        or primary_key > max_primary_keys
    ):
        # report the configured bound instead of a hard-coded "7"
        errors.append(
            f"primary key order must be an integer between 1 and {max_primary_keys} if provided"
        )

    # hasattr guards against non-DataType values reaching this check
    if (
        hasattr(data_type, "valid_for_primary_key")
        and not data_type.valid_for_primary_key()
    ):
        errors.append(
            f"invalid data type '{data_type.value}' for primary key column; "
            "primary keys must be Text, Integer, Date, or Datetime"
        )

    return errors
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _normalise_name(name: str) -> str:
|
|
119
|
+
return name.upper().strip()
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _norm_header_map(columns: list) -> dict:
|
|
123
|
+
mapping, _ = {}, set()
|
|
124
|
+
for c in columns:
|
|
125
|
+
k = str(c).strip().lower().replace(" ", "_").replace("-", "_")
|
|
126
|
+
if k in mapping: # collision
|
|
127
|
+
raise DataDictionaryImportError(
|
|
128
|
+
f"Ambiguous headers after normalisation: {mapping[k]!r} and {c!r} both map to {k!r}"
|
|
129
|
+
)
|
|
130
|
+
mapping[k] = c
|
|
131
|
+
return mapping
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _get_required_header(header_map: dict[str, str], key: str) -> str:
|
|
135
|
+
if key not in header_map:
|
|
136
|
+
raise DataDictionaryImportError(
|
|
137
|
+
f"Required Data Dictionary column '{key}' not found. Available: {list(header_map.keys())}"
|
|
138
|
+
)
|
|
139
|
+
return header_map[key]
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _is_missing(val: Any) -> bool:
|
|
143
|
+
return _pd_isna(val) or (isinstance(val, str) and val.strip() == "")
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _parse_truthy(val: Any) -> bool:
|
|
147
|
+
if isinstance(val, str):
|
|
148
|
+
return val.strip().lower() in {"y", "yes", "true", "1"}
|
|
149
|
+
if isinstance(val, (int, float)):
|
|
150
|
+
try:
|
|
151
|
+
return int(val) == 1
|
|
152
|
+
except Exception:
|
|
153
|
+
return False
|
|
154
|
+
return False
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _row_is_blank(row: Series, keys: tuple[str, str]) -> bool:
    """Return True when both keyed cells of *row* are missing/blank."""
    first_key, second_key = keys
    return all(_is_missing(row[key]) for key in (first_key, second_key))
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _parse_int(
    value: Any, label: str, row_ctx: str, *, required: bool = True
) -> int | None:
    """Parse *value* as an integer for the Data Dictionary import.

    Returns None when the value is missing and not required. Raises
    DataDictionaryImportError when a required value is missing or when the
    value cannot be converted to an integer.
    """
    if _is_missing(value):
        if not required:
            return None
        raise DataDictionaryImportError(f"{row_ctx}: {label} is required.")
    try:
        return int(value)
    except Exception as exc:
        raise DataDictionaryImportError(
            f"{row_ctx}: {label} must be integer (got {value!r})."
        ) from exc
|