valediction 1.0.3__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {valediction-1.0.3 → valediction-1.2.0}/PKG-INFO +1 -1
- {valediction-1.0.3 → valediction-1.2.0}/pyproject.toml +1 -1
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/convenience.py +7 -12
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/datasets/datasets.py +17 -17
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/dictionary/generation.py +5 -5
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/dictionary/helpers.py +0 -7
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/dictionary/importing.py +43 -20
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/dictionary/model.py +108 -36
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/integrity.py +67 -13
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/io/csv_readers.py +3 -3
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/support.py +5 -1
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/validation/helpers.py +30 -33
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/validation/issues.py +37 -25
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/validation/validation.py +102 -53
- {valediction-1.0.3 → valediction-1.2.0}/.gitignore +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/__init__.py +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/data_types/__init__.py +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/data_types/data_type_helpers.py +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/data_types/data_types.py +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/data_types/type_inference.py +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/datasets/__init__.py +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/datasets/datasets_helpers.py +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/demo/DEMO - Data Dictionary.xlsx +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/demo/DEMOGRAPHICS.csv +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/demo/DIAGNOSES.csv +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/demo/LAB_TESTS.csv +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/demo/VITALS.csv +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/demo/__init__.py +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/demo/demo_dictionary.py +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/dictionary/__init__.py +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/dictionary/exporting.py +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/dictionary/exporting_helpers.py +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/dictionary/integrity.py +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/dictionary/template/PROJECT - Data Dictionary.xltx +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/exceptions.py +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/io/__init__.py +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/progress.py +0 -0
- {valediction-1.0.3 → valediction-1.2.0}/src/valediction/validation/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: valediction
|
|
3
|
-
Version: 1.0.3
|
|
3
|
+
Version: 1.2.0
|
|
4
4
|
Summary: Valediction is a convenience data validation package that allows generation, import, and constraint enforcement of user-defined data dictionaries against datasets.
|
|
5
5
|
Author-email: Cai Davis <Cai.Davis@uhs.nhs.uk>
|
|
6
6
|
Requires-Python: <4.0,>=3.11
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "valediction"
|
|
3
|
-
version = "1.0.3"
|
|
3
|
+
version = "1.2.0"
|
|
4
4
|
description = "Valediction is a convenience data validation package that allows generation, import, and constraint enforcement of user-defined data dictionaries against datasets."
|
|
5
5
|
authors = [{ name = "Cai Davis", email = "Cai.Davis@uhs.nhs.uk" }]
|
|
6
6
|
requires-python = ">=3.11,<4.0"
|
|
@@ -3,12 +3,11 @@ from pathlib import Path
|
|
|
3
3
|
from pandas import DataFrame
|
|
4
4
|
|
|
5
5
|
from valediction.datasets.datasets import Dataset
|
|
6
|
-
from valediction.dictionary.importing import import_dictionary
|
|
7
6
|
from valediction.dictionary.model import Dictionary
|
|
8
7
|
|
|
9
8
|
|
|
10
9
|
def validate(
|
|
11
|
-
|
|
10
|
+
dataset: str | Path | dict[str, DataFrame],
|
|
12
11
|
dictionary: Dictionary | str | Path,
|
|
13
12
|
*,
|
|
14
13
|
import_data: bool = False,
|
|
@@ -31,20 +30,16 @@ def validate(
|
|
|
31
30
|
Returns:
|
|
32
31
|
Dataset: dataset, with or without Issues
|
|
33
32
|
"""
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
else import_dictionary(dictionary)
|
|
38
|
-
)
|
|
39
|
-
data: Dataset = Dataset.create_from(data)
|
|
40
|
-
data.import_dictionary(dictionary)
|
|
33
|
+
|
|
34
|
+
_dataset: Dataset = Dataset.create_from(dataset)
|
|
35
|
+
_dataset.import_dictionary(dictionary)
|
|
41
36
|
|
|
42
37
|
if import_data:
|
|
43
|
-
|
|
38
|
+
_dataset.import_data()
|
|
44
39
|
|
|
45
|
-
|
|
40
|
+
_dataset.validate(
|
|
46
41
|
chunk_size=chunk_size,
|
|
47
42
|
feedback=feedback,
|
|
48
43
|
)
|
|
49
44
|
|
|
50
|
-
return
|
|
45
|
+
return _dataset
|
|
@@ -20,7 +20,8 @@ from valediction.io.csv_readers import (
|
|
|
20
20
|
)
|
|
21
21
|
from valediction.support import (
|
|
22
22
|
_get_runtime_string,
|
|
23
|
-
|
|
23
|
+
_normalise,
|
|
24
|
+
_strip,
|
|
24
25
|
list_as_bullets,
|
|
25
26
|
print_bold_red,
|
|
26
27
|
print_red,
|
|
@@ -135,16 +136,16 @@ class DatasetItem:
|
|
|
135
136
|
# Validation
|
|
136
137
|
def validate(
|
|
137
138
|
self,
|
|
138
|
-
chunk_size: int =
|
|
139
|
+
chunk_size: int | None = 10_000_000,
|
|
139
140
|
feedback: bool = True,
|
|
140
|
-
):
|
|
141
|
+
) -> None:
|
|
141
142
|
"""
|
|
142
143
|
Summary:
|
|
143
144
|
Validates the dataset item against the dictionary.
|
|
144
145
|
Warns if there are issues with the integrity of the data.
|
|
145
146
|
|
|
146
147
|
Arguments:
|
|
147
|
-
chunk_size (int): Size of chunks for validating data to optimise RAM usage,
|
|
148
|
+
chunk_size (int | None): Size of chunks for validating data to optimise RAM usage,
|
|
148
149
|
if reading from CSV (default: 10_000_000)
|
|
149
150
|
feedback (bool): Provide user feedback on progress (default: True)
|
|
150
151
|
|
|
@@ -437,16 +438,16 @@ class Dataset(list[DatasetItem]):
|
|
|
437
438
|
|
|
438
439
|
# Getters
|
|
439
440
|
def get(self, name: str, default: DatasetItem | None = None) -> DatasetItem | None:
|
|
440
|
-
name_key = name
|
|
441
|
+
name_key = _normalise(name)
|
|
441
442
|
for item in self:
|
|
442
|
-
if item.name
|
|
443
|
+
if _normalise(item.name) == name_key:
|
|
443
444
|
return item
|
|
444
445
|
return default
|
|
445
446
|
|
|
446
447
|
def index_of(self, name: str) -> int | None:
|
|
447
|
-
name_key = name
|
|
448
|
+
name_key = _normalise(name)
|
|
448
449
|
for i, item in enumerate(self):
|
|
449
|
-
if item.name == name_key:
|
|
450
|
+
if _normalise(item.name) == name_key:
|
|
450
451
|
return i
|
|
451
452
|
return None
|
|
452
453
|
|
|
@@ -710,7 +711,7 @@ class Dataset(list[DatasetItem]):
|
|
|
710
711
|
# Validation
|
|
711
712
|
def validate(
|
|
712
713
|
self,
|
|
713
|
-
chunk_size: int =
|
|
714
|
+
chunk_size: int | None = 10_000_000,
|
|
714
715
|
feedback: bool = True,
|
|
715
716
|
) -> None:
|
|
716
717
|
"""
|
|
@@ -751,7 +752,7 @@ class Dataset(list[DatasetItem]):
|
|
|
751
752
|
if feedback:
|
|
752
753
|
print("\n", end="")
|
|
753
754
|
|
|
754
|
-
def __reattach_issues(self):
|
|
755
|
+
def __reattach_issues(self) -> None:
|
|
755
756
|
self.issues = Issues()
|
|
756
757
|
for item in self:
|
|
757
758
|
self.issues.extend(item.issues)
|
|
@@ -796,20 +797,21 @@ class Dataset(list[DatasetItem]):
|
|
|
796
797
|
name: str | None,
|
|
797
798
|
data: DataLike,
|
|
798
799
|
) -> DatasetItem:
|
|
799
|
-
"""Normalise a (
|
|
800
|
+
"""Normalise a (name, data) double into a DatasetItem."""
|
|
800
801
|
if isinstance(data, (str, Path)):
|
|
801
802
|
path = Path(data)
|
|
802
803
|
if not path.exists():
|
|
803
804
|
raise FileNotFoundError(f"File not found: {path}")
|
|
804
805
|
if path.suffix.lower() != ".csv":
|
|
805
806
|
raise ValueError(f"Only .csv supported right now, got: {path}")
|
|
806
|
-
resolved_name =
|
|
807
|
+
resolved_name = _strip(name or path.stem)
|
|
807
808
|
return DatasetItem(name=resolved_name, data=path.resolve())
|
|
808
809
|
|
|
809
810
|
if isinstance(data, DataFrame):
|
|
810
811
|
if not name:
|
|
811
812
|
raise ValueError("When providing a DataFrame, 'name' is required.")
|
|
812
|
-
resolved_name =
|
|
813
|
+
resolved_name = _strip(name)
|
|
814
|
+
data.columns = [_strip(column) for column in data.columns]
|
|
813
815
|
return DatasetItem(name=resolved_name, data=data)
|
|
814
816
|
|
|
815
817
|
raise TypeError("data must be a Path/str to .csv or a pandas DataFrame.")
|
|
@@ -823,13 +825,11 @@ class Dataset(list[DatasetItem]):
|
|
|
823
825
|
if p.is_file():
|
|
824
826
|
if p.suffix.lower() != ".csv":
|
|
825
827
|
raise ValueError(f"Expected a .csv file, got: {p.suffix} ({p})")
|
|
826
|
-
return [DatasetItem(name=
|
|
828
|
+
return [DatasetItem(name=_strip(p.stem), data=p.resolve())]
|
|
827
829
|
|
|
828
830
|
if p.is_dir():
|
|
829
831
|
return [
|
|
830
|
-
DatasetItem(
|
|
831
|
-
name=_normalise_name(csv_path.stem), data=csv_path.resolve()
|
|
832
|
-
)
|
|
832
|
+
DatasetItem(name=_strip(csv_path.stem), data=csv_path.resolve())
|
|
833
833
|
for csv_path in p.glob("*.csv")
|
|
834
834
|
]
|
|
835
835
|
|
|
@@ -24,7 +24,7 @@ from valediction.io.csv_readers import (
|
|
|
24
24
|
read_csv_sample,
|
|
25
25
|
)
|
|
26
26
|
from valediction.progress import Progress
|
|
27
|
-
from valediction.support import
|
|
27
|
+
from valediction.support import _strip, calculate_runtime
|
|
28
28
|
|
|
29
29
|
IMPORTING_DATA = "Importing data"
|
|
30
30
|
CHUNK_STEPS = 1
|
|
@@ -124,7 +124,7 @@ class Generator:
|
|
|
124
124
|
self.__say(f"Generating dictionary for {len(items)} tables")
|
|
125
125
|
for item in items:
|
|
126
126
|
self.__progress_init(item)
|
|
127
|
-
table = Table(name=
|
|
127
|
+
table = Table(name=_strip(item.name))
|
|
128
128
|
dictionary.add_table(table)
|
|
129
129
|
|
|
130
130
|
if item.is_path:
|
|
@@ -192,7 +192,7 @@ class Generator:
|
|
|
192
192
|
col_state = inferer.states[col_name]
|
|
193
193
|
data_type, length = col_state.final_data_type_and_length()
|
|
194
194
|
col = Column(
|
|
195
|
-
name=
|
|
195
|
+
name=_strip(col_name),
|
|
196
196
|
order=idx,
|
|
197
197
|
data_type=data_type,
|
|
198
198
|
length=length if data_type == DataType.TEXT else None,
|
|
@@ -242,7 +242,7 @@ class Generator:
|
|
|
242
242
|
col_state = inferer.states[col_name]
|
|
243
243
|
data_type, length = col_state.final_data_type_and_length()
|
|
244
244
|
col = Column(
|
|
245
|
-
name=
|
|
245
|
+
name=_strip(col_name),
|
|
246
246
|
order=idx,
|
|
247
247
|
data_type=data_type,
|
|
248
248
|
length=length if data_type == DataType.TEXT else None,
|
|
@@ -277,7 +277,7 @@ class Generator:
|
|
|
277
277
|
next_order = max((c.order or 0 for c in table), default=0) + 1
|
|
278
278
|
data_type, length = col_state.final_data_type_and_length()
|
|
279
279
|
new_col = Column(
|
|
280
|
-
name=
|
|
280
|
+
name=_strip(col_name),
|
|
281
281
|
order=next_order,
|
|
282
282
|
data_type=data_type,
|
|
283
283
|
length=length if data_type == DataType.TEXT else None,
|
|
@@ -26,9 +26,6 @@ def _check_name(name: str, entity: Literal["table", "column"]) -> list[str]:
|
|
|
26
26
|
else config.max_column_name_length
|
|
27
27
|
)
|
|
28
28
|
|
|
29
|
-
if name != name.upper(): # name must be uppercase
|
|
30
|
-
errors.append("must be uppercase")
|
|
31
|
-
|
|
32
29
|
if invalid_chars.search(name): # check invalid characters
|
|
33
30
|
bad = set(invalid_chars.findall(name))
|
|
34
31
|
errors.append(
|
|
@@ -115,10 +112,6 @@ def _check_primary_key(primary_key: int | None, data_type: DataType) -> list[str
|
|
|
115
112
|
return errors
|
|
116
113
|
|
|
117
114
|
|
|
118
|
-
def _normalise_name(name: str) -> str:
|
|
119
|
-
return name.upper().strip()
|
|
120
|
-
|
|
121
|
-
|
|
122
115
|
def _norm_header_map(columns: list) -> dict:
|
|
123
116
|
mapping, _ = {}, set()
|
|
124
117
|
for c in columns:
|
|
@@ -11,7 +11,6 @@ from valediction.dictionary.helpers import (
|
|
|
11
11
|
_get_required_header,
|
|
12
12
|
_is_missing,
|
|
13
13
|
_norm_header_map,
|
|
14
|
-
_normalise_name,
|
|
15
14
|
_parse_int,
|
|
16
15
|
_parse_truthy,
|
|
17
16
|
_row_is_blank,
|
|
@@ -19,7 +18,7 @@ from valediction.dictionary.helpers import (
|
|
|
19
18
|
from valediction.dictionary.integrity import REQUIRED_SHEETS
|
|
20
19
|
from valediction.dictionary.model import Column, Dictionary, Table
|
|
21
20
|
from valediction.exceptions import DataDictionaryError, DataDictionaryImportError
|
|
22
|
-
from valediction.support import list_as_bullets
|
|
21
|
+
from valediction.support import _normalise, _strip, list_as_bullets
|
|
23
22
|
|
|
24
23
|
|
|
25
24
|
@dataclass
|
|
@@ -80,6 +79,13 @@ class ExcelDataDictionary:
|
|
|
80
79
|
raise error
|
|
81
80
|
|
|
82
81
|
# Import & Helpers
|
|
82
|
+
def _resolve_table_name(self, name: str) -> str | None:
|
|
83
|
+
"""Return the canonical table name as it appears in Tables sheet (or None)."""
|
|
84
|
+
target = _normalise(name)
|
|
85
|
+
return next(
|
|
86
|
+
(t for t in self.table_metadata.keys() if _normalise(t) == target), None
|
|
87
|
+
)
|
|
88
|
+
|
|
83
89
|
def _open_workbook(self) -> None:
|
|
84
90
|
if not self.path.exists():
|
|
85
91
|
raise DataDictionaryImportError(f"File not found: {self.path}")
|
|
@@ -140,20 +146,27 @@ class ExcelDataDictionary:
|
|
|
140
146
|
description_col_header = _get_required_header(header_map, "description")
|
|
141
147
|
|
|
142
148
|
meta: dict[str, str | None] = {}
|
|
149
|
+
seen: set[str] = set()
|
|
150
|
+
|
|
143
151
|
for _, row in tables_df.iterrows():
|
|
144
152
|
if _is_missing(row[table_col_header]):
|
|
145
153
|
continue
|
|
146
|
-
|
|
154
|
+
|
|
155
|
+
table_name = _strip(str(row[table_col_header]))
|
|
147
156
|
table_description = (
|
|
148
157
|
None
|
|
149
158
|
if _is_missing(row[description_col_header])
|
|
150
159
|
else str(row[description_col_header])
|
|
151
160
|
)
|
|
152
|
-
|
|
161
|
+
|
|
162
|
+
key = _normalise(table_name)
|
|
163
|
+
if key in seen:
|
|
153
164
|
raise DataDictionaryImportError(
|
|
154
165
|
f"Duplicate table '{table_name}' in Tables sheet."
|
|
155
166
|
)
|
|
167
|
+
seen.add(key)
|
|
156
168
|
meta[table_name] = table_description
|
|
169
|
+
|
|
157
170
|
if not meta:
|
|
158
171
|
raise DataDictionaryImportError(
|
|
159
172
|
"Data Dictionary sheet 'Tables' contains no table rows."
|
|
@@ -177,12 +190,13 @@ class ExcelDataDictionary:
|
|
|
177
190
|
or _is_missing(row[code_col_header])
|
|
178
191
|
):
|
|
179
192
|
continue
|
|
180
|
-
table_name =
|
|
181
|
-
column_name =
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
]
|
|
193
|
+
table_name = _strip(str(row[table_col_header]))
|
|
194
|
+
column_name = _strip(str(row[column_col_header]))
|
|
195
|
+
resolved_table = self._resolve_table_name(table_name) or table_name
|
|
196
|
+
enum_key = (_normalise(resolved_table), _normalise(column_name))
|
|
197
|
+
enum_map.setdefault(enum_key, {})
|
|
198
|
+
enum_map[enum_key][row[code_col_header]] = row[name_col_header]
|
|
199
|
+
|
|
186
200
|
self.enumerations = enum_map
|
|
187
201
|
|
|
188
202
|
# Parse Columns
|
|
@@ -234,7 +248,12 @@ class ExcelDataDictionary:
|
|
|
234
248
|
|
|
235
249
|
self.table_columns[inputs.table_name].append(column_obj)
|
|
236
250
|
if inputs.has_enumerations:
|
|
237
|
-
self.enum_flags.add(
|
|
251
|
+
self.enum_flags.add(
|
|
252
|
+
(
|
|
253
|
+
_normalise(inputs.table_name),
|
|
254
|
+
_normalise(inputs.column_name),
|
|
255
|
+
)
|
|
256
|
+
)
|
|
238
257
|
|
|
239
258
|
if errors:
|
|
240
259
|
raise DataDictionaryImportError(
|
|
@@ -279,7 +298,7 @@ class ExcelDataDictionary:
|
|
|
279
298
|
|
|
280
299
|
# Validate Foreign Keys
|
|
281
300
|
def _validate_foreign_keys(self) -> None:
|
|
282
|
-
name_to_table = {t.name: t for t in self.tables}
|
|
301
|
+
name_to_table = {_normalise(t.name): t for t in self.tables}
|
|
283
302
|
errors: list[str] = []
|
|
284
303
|
for table in self.tables:
|
|
285
304
|
for column in table:
|
|
@@ -292,9 +311,9 @@ class ExcelDataDictionary:
|
|
|
292
311
|
)
|
|
293
312
|
continue
|
|
294
313
|
target_table_raw, target_column_raw = target.split(".", 1)
|
|
295
|
-
target_table_name =
|
|
296
|
-
target_column_name =
|
|
297
|
-
referenced_table = name_to_table.get(target_table_name)
|
|
314
|
+
target_table_name = _strip(target_table_raw)
|
|
315
|
+
target_column_name = _strip(target_column_raw)
|
|
316
|
+
referenced_table = name_to_table.get(_normalise(target_table_name))
|
|
298
317
|
if not referenced_table:
|
|
299
318
|
errors.append(
|
|
300
319
|
f"{table.name}.{column.name} references unknown table {target_table_name!r}."
|
|
@@ -392,13 +411,17 @@ class ExcelDataDictionary:
|
|
|
392
411
|
f"{row_context}: missing required field(s): {', '.join(missing_fields)}."
|
|
393
412
|
)
|
|
394
413
|
|
|
395
|
-
|
|
396
|
-
column_name =
|
|
397
|
-
|
|
414
|
+
table_name_raw = _strip(str(row[table_col_header]))
|
|
415
|
+
column_name = _strip(str(row[column_col_header]))
|
|
416
|
+
|
|
417
|
+
resolved_table_name = self._resolve_table_name(table_name_raw)
|
|
418
|
+
if resolved_table_name is None:
|
|
398
419
|
raise DataDictionaryImportError(
|
|
399
|
-
f"{row_context}: Table '{
|
|
420
|
+
f"{row_context}: Table '{table_name_raw}' not present in Tables sheet."
|
|
400
421
|
)
|
|
401
422
|
|
|
423
|
+
table_name = resolved_table_name
|
|
424
|
+
|
|
402
425
|
order_int = _parse_int(row[order_col_header], "Order", row_context)
|
|
403
426
|
length_int = (
|
|
404
427
|
_parse_int(row[length_col_header], "Length", row_context, required=False)
|
|
@@ -461,7 +484,7 @@ class ExcelDataDictionary:
|
|
|
461
484
|
|
|
462
485
|
def _make_column(self, inputs: _ColumnInputs) -> Column:
|
|
463
486
|
enums_for_column = self.enumerations.get(
|
|
464
|
-
(inputs.table_name, inputs.column_name), {}
|
|
487
|
+
(_normalise(inputs.table_name), _normalise(inputs.column_name)), {}
|
|
465
488
|
)
|
|
466
489
|
return Column(
|
|
467
490
|
name=inputs.column_name,
|
|
@@ -9,10 +9,9 @@ from valediction.dictionary.helpers import (
|
|
|
9
9
|
_check_name,
|
|
10
10
|
_check_order,
|
|
11
11
|
_check_primary_key,
|
|
12
|
-
_normalise_name,
|
|
13
12
|
)
|
|
14
13
|
from valediction.exceptions import DataDictionaryError
|
|
15
|
-
from valediction.support import list_as_bullets
|
|
14
|
+
from valediction.support import _normalise, _strip, list_as_bullets
|
|
16
15
|
|
|
17
16
|
|
|
18
17
|
class Column:
|
|
@@ -44,7 +43,7 @@ class Column:
|
|
|
44
43
|
description: str | None = None,
|
|
45
44
|
datetime_format: str | None = None,
|
|
46
45
|
):
|
|
47
|
-
self.name =
|
|
46
|
+
self.name = _strip(name)
|
|
48
47
|
self.order = int(order) if order is not None else None
|
|
49
48
|
self.data_type: DataType = None
|
|
50
49
|
self.length = int(length) if length is not None else None
|
|
@@ -127,7 +126,7 @@ class Table(list[Column]):
|
|
|
127
126
|
columns: list[Column] | None = None,
|
|
128
127
|
):
|
|
129
128
|
super().__init__()
|
|
130
|
-
self.name =
|
|
129
|
+
self.name = _strip(name)
|
|
131
130
|
self.description = description
|
|
132
131
|
for column in columns or []:
|
|
133
132
|
self.add_column(column)
|
|
@@ -139,24 +138,28 @@ class Table(list[Column]):
|
|
|
139
138
|
)
|
|
140
139
|
return f"Table(name={self.name!r}, description={self.description!r}{cols_str})"
|
|
141
140
|
|
|
141
|
+
def __key(self, name: str) -> str:
|
|
142
|
+
return _normalise(name)
|
|
143
|
+
|
|
142
144
|
def __getitem__(self, key: int | str) -> Column:
|
|
143
145
|
if isinstance(key, int):
|
|
144
146
|
return super().__getitem__(key)
|
|
145
|
-
|
|
146
|
-
|
|
147
|
+
|
|
148
|
+
target_key = self.__key(key)
|
|
149
|
+
found = next((c for c in self if self.__key(c.name) == target_key), None)
|
|
147
150
|
if not found:
|
|
148
151
|
raise KeyError(f"Column {key!r} not found in table {self.name!r}.")
|
|
149
152
|
return found
|
|
150
153
|
|
|
151
154
|
def __get(self, name: str, default: Column | None = None) -> Column | None:
|
|
152
|
-
|
|
153
|
-
return next((c for c in self if c.name ==
|
|
155
|
+
target_key = self.__key(name)
|
|
156
|
+
return next((c for c in self if self.__key(c.name) == target_key), default)
|
|
154
157
|
|
|
155
158
|
# Getters
|
|
156
159
|
def index_of(self, name: str) -> int | None:
|
|
157
|
-
|
|
160
|
+
target_key = self.__key(name)
|
|
158
161
|
for i, c in enumerate(self):
|
|
159
|
-
if c.name ==
|
|
162
|
+
if self.__key(c.name) == target_key:
|
|
160
163
|
return i
|
|
161
164
|
return None
|
|
162
165
|
|
|
@@ -303,16 +306,17 @@ class Table(list[Column]):
|
|
|
303
306
|
if not isinstance(column, Column):
|
|
304
307
|
raise DataDictionaryError("Only Column objects can be added to a Table.")
|
|
305
308
|
|
|
306
|
-
|
|
307
|
-
|
|
309
|
+
incoming_key = self.__key(column.name)
|
|
310
|
+
conflict = next((c for c in self if self.__key(c.name) == incoming_key), None)
|
|
311
|
+
if conflict is not None:
|
|
308
312
|
raise DataDictionaryError(
|
|
309
|
-
f"Column {column.name!r} already exists (order={conflict.order!r})"
|
|
313
|
+
f"Column {column.name!r} already exists (order={conflict.order!r}, as {conflict.name!r})."
|
|
310
314
|
)
|
|
311
315
|
|
|
312
316
|
if column.order in self.get_column_orders():
|
|
313
|
-
|
|
317
|
+
conflict_by_order = self.get_column(column.order)
|
|
314
318
|
raise DataDictionaryError(
|
|
315
|
-
f"Order {column.order!r} already exists (name={
|
|
319
|
+
f"Order {column.order!r} already exists (name={conflict_by_order.name!r})"
|
|
316
320
|
)
|
|
317
321
|
|
|
318
322
|
if column.primary_key is not None:
|
|
@@ -339,10 +343,7 @@ class Table(list[Column]):
|
|
|
339
343
|
Raises:
|
|
340
344
|
DataDictionaryError: if the column does not exist
|
|
341
345
|
"""
|
|
342
|
-
|
|
343
|
-
name = self.get_column(column).name
|
|
344
|
-
else:
|
|
345
|
-
name = self.get_column(column).name # by order
|
|
346
|
+
name = self.get_column(column).name
|
|
346
347
|
remaining = [c for c in self if c.name != name]
|
|
347
348
|
self.clear()
|
|
348
349
|
super().extend(remaining)
|
|
@@ -367,16 +368,17 @@ class Table(list[Column]):
|
|
|
367
368
|
for col in self:
|
|
368
369
|
col.primary_key = None
|
|
369
370
|
|
|
370
|
-
# Resolve and
|
|
371
|
+
# Resolve and deduplicate
|
|
371
372
|
resolved: list[Column] = []
|
|
372
373
|
seen: set[str] = set()
|
|
373
374
|
for key in primary_keys:
|
|
374
375
|
col = self.get_column(key)
|
|
375
|
-
|
|
376
|
+
col_key = self.__key(col.name)
|
|
377
|
+
if col_key in seen:
|
|
376
378
|
raise DataDictionaryError(
|
|
377
379
|
f"Duplicate column {col.name!r} provided for table {self.name!r}."
|
|
378
380
|
)
|
|
379
|
-
seen.add(
|
|
381
|
+
seen.add(col_key)
|
|
380
382
|
resolved.append(col)
|
|
381
383
|
|
|
382
384
|
# Assign ordinals 1..N
|
|
@@ -416,14 +418,20 @@ class Dictionary(list[Table]):
|
|
|
416
418
|
):
|
|
417
419
|
super().__init__()
|
|
418
420
|
self.name = name
|
|
421
|
+
|
|
422
|
+
if isinstance(tables, Table):
|
|
423
|
+
tables = [tables]
|
|
424
|
+
|
|
419
425
|
for t in tables or []:
|
|
420
426
|
self.add_table(t)
|
|
427
|
+
|
|
421
428
|
self.organisations = organisations
|
|
422
429
|
self.version = version
|
|
423
430
|
self.version_notes = version_notes
|
|
424
431
|
self.inclusion_criteria = inclusion_criteria
|
|
425
432
|
self.exclusion_criteria = exclusion_criteria
|
|
426
433
|
self.imported = imported
|
|
434
|
+
self.__check_variables()
|
|
427
435
|
|
|
428
436
|
# Properties
|
|
429
437
|
@property
|
|
@@ -439,24 +447,85 @@ class Dictionary(list[Table]):
|
|
|
439
447
|
tables = list_as_bullets(elements=[str(t) for t in self], bullet="\n- ")
|
|
440
448
|
return f"Dictionary(name={self.name!r}, imported={self.imported!r}, {tables})"
|
|
441
449
|
|
|
450
|
+
def __key(self, name: str) -> str:
|
|
451
|
+
return _normalise(name)
|
|
452
|
+
|
|
442
453
|
def __getitem__(self, key: int | str) -> Table:
|
|
443
454
|
if isinstance(key, int):
|
|
444
455
|
return super().__getitem__(key)
|
|
445
|
-
|
|
446
|
-
|
|
456
|
+
|
|
457
|
+
target_key = self.__key(key)
|
|
458
|
+
found = next((t for t in self if self.__key(t.name) == target_key), None)
|
|
447
459
|
if not found:
|
|
448
460
|
raise KeyError(f"Table {key!r} not found in Dictionary.")
|
|
449
461
|
return found
|
|
450
462
|
|
|
451
|
-
# Getters
|
|
452
463
|
def __get(self, name: str, default: Table | None = None) -> Table | None:
|
|
453
|
-
|
|
454
|
-
return next((t for t in self if t.name ==
|
|
464
|
+
target_key = self.__key(name)
|
|
465
|
+
return next((t for t in self if self.__key(t.name) == target_key), default)
|
|
466
|
+
|
|
467
|
+
# Checkers
|
|
468
|
+
def __check_variables(self) -> None:
|
|
469
|
+
self.__check_name()
|
|
470
|
+
self.__check_organisations()
|
|
471
|
+
self.__check_version()
|
|
472
|
+
self.__check_version_notes()
|
|
473
|
+
self.__check_criteria()
|
|
474
|
+
|
|
475
|
+
def __check_name(self) -> None:
|
|
476
|
+
# Check name
|
|
477
|
+
if self.name is not None:
|
|
478
|
+
if not isinstance(self.name, str):
|
|
479
|
+
raise DataDictionaryError("Dictionary `name` must be a string.")
|
|
480
|
+
|
|
481
|
+
def __check_organisations(self) -> None:
|
|
482
|
+
# Check organisations
|
|
483
|
+
if self.organisations is not None:
|
|
484
|
+
if not isinstance(self.organisations, str):
|
|
485
|
+
raise DataDictionaryError(
|
|
486
|
+
"Dictionary `organisations` must be a string."
|
|
487
|
+
)
|
|
488
|
+
|
|
489
|
+
def __check_version(self) -> None:
|
|
490
|
+
# Check version
|
|
491
|
+
if self.version is not None:
|
|
492
|
+
if not isinstance(self.version, (str, int, float)):
|
|
493
|
+
raise DataDictionaryError(
|
|
494
|
+
"Dictionary `version` must be a string, int, or float."
|
|
495
|
+
)
|
|
496
|
+
|
|
497
|
+
if isinstance(self.version, (int, float)):
|
|
498
|
+
self.version = str(self.version)
|
|
499
|
+
|
|
500
|
+
# Check version_notes
|
|
455
501
|
|
|
502
|
+
def __check_version_notes(self) -> None:
|
|
503
|
+
if self.version_notes is not None:
|
|
504
|
+
if not isinstance(self.version_notes, str):
|
|
505
|
+
raise DataDictionaryError(
|
|
506
|
+
"Dictionary `version_notes` must be a string."
|
|
507
|
+
)
|
|
508
|
+
|
|
509
|
+
def __check_criteria(self) -> None:
|
|
510
|
+
# Check inclusion_criteria
|
|
511
|
+
if self.inclusion_criteria is not None:
|
|
512
|
+
if not isinstance(self.inclusion_criteria, str):
|
|
513
|
+
raise DataDictionaryError(
|
|
514
|
+
"Dictionary `inclusion_criteria` must be a string."
|
|
515
|
+
)
|
|
516
|
+
|
|
517
|
+
# Check exclusion_criteria
|
|
518
|
+
if self.exclusion_criteria is not None:
|
|
519
|
+
if not isinstance(self.exclusion_criteria, str):
|
|
520
|
+
raise DataDictionaryError(
|
|
521
|
+
"Dictionary exclusion_criteria must be a string."
|
|
522
|
+
)
|
|
523
|
+
|
|
524
|
+
# Getters
|
|
456
525
|
def index_of(self, name: str) -> int | None:
|
|
457
|
-
|
|
526
|
+
target_key = self.__key(name)
|
|
458
527
|
for i, t in enumerate(self):
|
|
459
|
-
if t.name ==
|
|
528
|
+
if self.__key(t.name) == target_key:
|
|
460
529
|
return i
|
|
461
530
|
return None
|
|
462
531
|
|
|
@@ -484,12 +553,9 @@ class Dictionary(list[Table]):
|
|
|
484
553
|
Raises:
|
|
485
554
|
KeyError: If the table is not found in the dictionary.
|
|
486
555
|
"""
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
if not found:
|
|
556
|
+
found = self.__get(table)
|
|
557
|
+
if found is None:
|
|
491
558
|
raise KeyError(f"Table {table!r} not found in Dictionary.")
|
|
492
|
-
|
|
493
559
|
return found
|
|
494
560
|
|
|
495
561
|
# Manipulation
|
|
@@ -508,8 +574,14 @@ class Dictionary(list[Table]):
|
|
|
508
574
|
raise DataDictionaryError(
|
|
509
575
|
"Only Table objects can be added to a Dictionary."
|
|
510
576
|
)
|
|
511
|
-
|
|
512
|
-
|
|
577
|
+
|
|
578
|
+
incoming_key = self.__key(table.name)
|
|
579
|
+
conflict = next((t for t in self if self.__key(t.name) == incoming_key), None)
|
|
580
|
+
if conflict is not None:
|
|
581
|
+
raise DataDictionaryError(
|
|
582
|
+
f"Table {table.name!r} already exists (as {conflict.name!r})."
|
|
583
|
+
)
|
|
584
|
+
|
|
513
585
|
super().append(table)
|
|
514
586
|
|
|
515
587
|
def remove_table(self, table: str) -> None:
|