valediction 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. valediction/__init__.py +8 -0
  2. valediction/convenience.py +50 -0
  3. valediction/data_types/__init__.py +0 -0
  4. valediction/data_types/data_type_helpers.py +75 -0
  5. valediction/data_types/data_types.py +58 -0
  6. valediction/data_types/type_inference.py +541 -0
  7. valediction/datasets/__init__.py +0 -0
  8. valediction/datasets/datasets.py +870 -0
  9. valediction/datasets/datasets_helpers.py +46 -0
  10. valediction/demo/DEMO - Data Dictionary.xlsx +0 -0
  11. valediction/demo/DEMOGRAPHICS.csv +101 -0
  12. valediction/demo/DIAGNOSES.csv +650 -0
  13. valediction/demo/LAB_TESTS.csv +1001 -0
  14. valediction/demo/VITALS.csv +1001 -0
  15. valediction/demo/__init__.py +6 -0
  16. valediction/demo/demo_dictionary.py +129 -0
  17. valediction/dictionary/__init__.py +0 -0
  18. valediction/dictionary/exporting.py +501 -0
  19. valediction/dictionary/exporting_helpers.py +371 -0
  20. valediction/dictionary/generation.py +357 -0
  21. valediction/dictionary/helpers.py +174 -0
  22. valediction/dictionary/importing.py +494 -0
  23. valediction/dictionary/integrity.py +37 -0
  24. valediction/dictionary/model.py +582 -0
  25. valediction/dictionary/template/PROJECT - Data Dictionary.xltx +0 -0
  26. valediction/exceptions.py +22 -0
  27. valediction/integrity.py +97 -0
  28. valediction/io/__init__.py +0 -0
  29. valediction/io/csv_readers.py +307 -0
  30. valediction/progress.py +206 -0
  31. valediction/support.py +72 -0
  32. valediction/validation/__init__.py +0 -0
  33. valediction/validation/helpers.py +315 -0
  34. valediction/validation/issues.py +280 -0
  35. valediction/validation/validation.py +598 -0
  36. valediction-1.0.0.dist-info/METADATA +15 -0
  37. valediction-1.0.0.dist-info/RECORD +38 -0
  38. valediction-1.0.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,494 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ import pandas as pd
8
+
9
+ from valediction.data_types.data_types import DataType
10
+ from valediction.dictionary.helpers import (
11
+ _get_required_header,
12
+ _is_missing,
13
+ _norm_header_map,
14
+ _normalise_name,
15
+ _parse_int,
16
+ _parse_truthy,
17
+ _row_is_blank,
18
+ )
19
+ from valediction.dictionary.integrity import REQUIRED_SHEETS
20
+ from valediction.dictionary.model import Column, Dictionary, Table
21
+ from valediction.exceptions import DataDictionaryError, DataDictionaryImportError
22
+ from valediction.support import list_as_bullets
23
+
24
+
25
@dataclass
class _ColumnInputs:
    """Validated raw values extracted from one row of the Columns sheet.

    Intermediate carrier between row parsing (`_extract_column_inputs`)
    and `Column` construction (`_make_column`).
    """

    table_name: str                  # normalised; guaranteed present in Tables sheet
    column_name: str                 # normalised column name
    order_int: int                   # parsed 'Order' cell
    data_type: DataType              # parsed via DataType.parse
    length_int: int | None           # optional 'Length' cell
    vocabulary: str | None           # optional, stripped 'Vocabularies' cell
    primary_key_int: int | None      # optional 'Primary Key' ordinal
    foreign_key_target: str | None   # optional 'TABLE.COLUMN' reference, stripped
    description: str | None          # optional, stripped description text
    has_enumerations: bool           # truthy 'Enumerations' flag cell
    row_context: str                 # human-readable row locator for error messages
38
+
39
+
40
+ class ExcelDataDictionary:
41
    def __init__(self, filepath: str | Path):
        """Store the workbook path and initialise empty parse state.

        No I/O happens here; the workbook is opened by `_open_workbook`
        during `to_dictionary`.
        """
        self.path = Path(filepath)
        # set by _open_workbook(); closed again inside to_dictionary()
        self.excel_file: pd.ExcelFile | None = None
        # normalised (stripped, lower-cased) sheet name -> actual sheet name
        self.sheet_map: dict[str, str] = {}

        # parsed artefacts
        self.details: dict[str, Any] = {}
        self.table_metadata: dict[str, str | None] = {}
        self.enumerations: dict[tuple[str, str], dict[Any, Any]] = {}
        self.enum_flags: set[tuple[str, str]] = set()
        self.table_columns: dict[str, list[Column]] = {}
        self.tables: list[Table] = []
53
+
54
+ # Public API
55
+ def to_dictionary(self) -> Dictionary:
56
+ self._open_workbook()
57
+ try:
58
+ self._load_sheet_map()
59
+ self._parse_details()
60
+ self._parse_tables()
61
+ self._parse_enumerations()
62
+ self._parse_columns()
63
+ self._validate_enum_flags()
64
+ self._build_tables()
65
+ self._validate_foreign_keys()
66
+ self.excel_file.close()
67
+
68
+ return Dictionary(
69
+ name=(self.details.get("name") or None),
70
+ tables=self.tables,
71
+ organisations=(self.details.get("organisations") or None),
72
+ version=(self.details.get("version") or None),
73
+ version_notes=(self.details.get("version_notes") or None),
74
+ inclusion_criteria=(self.details.get("inclusion_criteria") or None),
75
+ exclusion_criteria=(self.details.get("exclusion_criteria") or None),
76
+ imported=True,
77
+ )
78
+ except Exception as error:
79
+ self.excel_file.close()
80
+ raise error
81
+
82
+ # Import & Helpers
83
+ def _open_workbook(self) -> None:
84
+ if not self.path.exists():
85
+ raise DataDictionaryImportError(f"File not found: {self.path}")
86
+ try:
87
+ self.excel_file = pd.ExcelFile(self.path, engine="openpyxl")
88
+ except Exception as e:
89
+ raise DataDictionaryImportError(
90
+ f"Unable to open Excel file {self.path}: {e}"
91
+ ) from e
92
+
93
+ def _load_sheet_map(self) -> None:
94
+ assert self.excel_file is not None
95
+ self.sheet_map = {
96
+ sheet_name.strip().lower(): sheet_name
97
+ for sheet_name in self.excel_file.sheet_names
98
+ if isinstance(sheet_name, str)
99
+ }
100
+ missing = sorted(REQUIRED_SHEETS - set(self.sheet_map))
101
+ if missing:
102
+ raise DataDictionaryImportError(
103
+ "Missing sheet(s): "
104
+ + ", ".join(missing)
105
+ + f". Found sheets: {', '.join(self.excel_file.sheet_names)}"
106
+ )
107
+
108
+ def _read_sheet(self, key: str, **kwargs: Any) -> pd.DataFrame:
109
+ assert self.excel_file is not None
110
+ if key not in self.sheet_map:
111
+ raise DataDictionaryImportError(f"Sheet {key!r} not found in workbook.")
112
+ return pd.read_excel(self.excel_file, sheet_name=self.sheet_map[key], **kwargs)
113
+
114
+ # Parse Details
115
+ def _parse_details(self) -> None:
116
+ details_df = self._read_sheet(
117
+ "details", header=None, dtype=str, keep_default_na=False
118
+ )
119
+ keys = {
120
+ "name": "name",
121
+ "organisation(s)": "organisations",
122
+ "version": "version",
123
+ "version notes": "version_notes",
124
+ "inclusion criteria": "inclusion_criteria",
125
+ "exclusion criteria": "exclusion_criteria",
126
+ }
127
+ self.details = {v: None for v in keys.values()}
128
+ for _, row in details_df.iterrows():
129
+ if len(row) < 2 or not isinstance(row.iloc[0], str):
130
+ continue
131
+ key_norm = row.iloc[0].strip().lower()
132
+ if key_norm in keys:
133
+ self.details[keys[key_norm]] = row.iloc[1] or None
134
+
135
+ # Parse Tables
136
+ def _parse_tables(self) -> None:
137
+ tables_df = self._read_sheet("tables", dtype=str, keep_default_na=False)
138
+ header_map = _norm_header_map(tables_df.columns)
139
+ table_col_header = _get_required_header(header_map, "table")
140
+ description_col_header = _get_required_header(header_map, "description")
141
+
142
+ meta: dict[str, str | None] = {}
143
+ for _, row in tables_df.iterrows():
144
+ if _is_missing(row[table_col_header]):
145
+ continue
146
+ table_name = _normalise_name(str(row[table_col_header]))
147
+ table_description = (
148
+ None
149
+ if _is_missing(row[description_col_header])
150
+ else str(row[description_col_header])
151
+ )
152
+ if table_name in meta:
153
+ raise DataDictionaryImportError(
154
+ f"Duplicate table '{table_name}' in Tables sheet."
155
+ )
156
+ meta[table_name] = table_description
157
+ if not meta:
158
+ raise DataDictionaryImportError(
159
+ "Data Dictionary sheet 'Tables' contains no table rows."
160
+ )
161
+ self.table_metadata = meta
162
+
163
+ # Parse Enumerations
164
    def _parse_enumerations(self) -> None:
        """Group enumeration code -> name pairs by (table, column).

        Rows missing any of Table/Column/Code are skipped entirely.
        """
        # NOTE(review): unlike the Details/Tables/Columns reads, this one does
        # not pass keep_default_na=False, so blank cells arrive as NaN rather
        # than "" — presumably _is_missing handles both; confirm. The Name
        # cell is never missing-checked, so an enumeration value may be NaN.
        enums_df = self._read_sheet("enumerations", dtype=str)
        header_map = _norm_header_map(enums_df.columns)
        table_col_header = _get_required_header(header_map, "table")
        column_col_header = _get_required_header(header_map, "column")
        code_col_header = _get_required_header(header_map, "code")
        name_col_header = _get_required_header(header_map, "name")

        enum_map: dict[tuple[str, str], dict[Any, Any]] = {}
        for _, row in enums_df.iterrows():
            if (
                _is_missing(row[table_col_header])
                or _is_missing(row[column_col_header])
                or _is_missing(row[code_col_header])
            ):
                continue
            table_name = _normalise_name(str(row[table_col_header]))
            column_name = _normalise_name(str(row[column_col_header]))
            # duplicate codes are not rejected: the last occurrence wins
            enum_map.setdefault((table_name, column_name), {})
            enum_map[(table_name, column_name)][row[code_col_header]] = row[
                name_col_header
            ]
        self.enumerations = enum_map
187
+
188
+ # Parse Columns
189
    def _parse_columns(self) -> None:
        """Parse the Columns sheet into ``self.table_columns``.

        Every row is attempted even after earlier rows fail, so the final
        DataDictionaryImportError aggregates all row-level problems at once.
        """
        columns_df, header_map = self._prepare_columns_sheet()
        (
            table_col_header,
            column_col_header,
            order_col_header,
            data_type_col_header,
            length_col_header,
            vocabulary_col_header,
            enumeration_flag_col_header,
            primary_key_col_header,
            foreign_key_col_header,
            description_col_header,
        ) = self._columns_headers(header_map)

        # one (possibly empty) bucket per table declared in the Tables sheet
        self.table_columns = {t: [] for t in self.table_metadata}
        errors: list[str] = []

        for idx, row in columns_df.iterrows():
            if _row_is_blank(row, (table_col_header, column_col_header)):
                continue
            try:
                inputs = self._extract_column_inputs(
                    idx=idx,
                    row=row,
                    table_col_header=table_col_header,
                    column_col_header=column_col_header,
                    order_col_header=order_col_header,
                    data_type_col_header=data_type_col_header,
                    length_col_header=length_col_header,
                    vocabulary_col_header=vocabulary_col_header,
                    enumeration_flag_col_header=enumeration_flag_col_header,
                    primary_key_col_header=primary_key_col_header,
                    foreign_key_col_header=foreign_key_col_header,
                    description_col_header=description_col_header,
                )
            except DataDictionaryImportError as e:
                # row-level template problem; keep scanning remaining rows
                errors.append(str(e))
                continue

            try:
                column_obj = self._make_column(inputs)
            except DataDictionaryError as e:
                # model-level construction failure for this row
                errors.append(f"{inputs.row_context}: {e}")
                continue

            self.table_columns[inputs.table_name].append(column_obj)
            if inputs.has_enumerations:
                # remembered so _validate_enum_flags can cross-check later
                self.enum_flags.add((inputs.table_name, inputs.column_name))

        if errors:
            raise DataDictionaryImportError(
                "Errors while parsing Columns sheet:\n" + list_as_bullets(errors)
            ) from None
243
+
244
+ # Validate Enumeration Flags
245
+ def _validate_enum_flags(self) -> None:
246
+ missing: list[str] = []
247
+ for key in self.enum_flags:
248
+ if key not in self.enumerations or not self.enumerations[key]:
249
+ table_name, column_name = key
250
+ missing.append(
251
+ f"{table_name}.{column_name} marked as having enumerations but none defined in Enumerations sheet."
252
+ )
253
+ if missing:
254
+ # Template issue, not model construction → ImportError
255
+ raise DataDictionaryImportError(
256
+ "Missing enumerations:\n" + list_as_bullets(missing)
257
+ )
258
+
259
+ # Build Tables
260
+ def _build_tables(self) -> None:
261
+ self.tables = []
262
+ for table_name, table_description in self.table_metadata.items():
263
+ columns_for_table = self.table_columns.get(table_name, [])
264
+ if not columns_for_table:
265
+ raise DataDictionaryImportError(
266
+ f"Table '{table_name}' has no columns defined in Columns sheet."
267
+ )
268
+ try:
269
+ self.tables.append(
270
+ Table(
271
+ name=table_name,
272
+ description=table_description,
273
+ columns=columns_for_table,
274
+ )
275
+ )
276
+ except DataDictionaryError as e:
277
+ # model-level errors (e.g., duplicate orders inside Table) bubble as DataDictionaryError
278
+ raise DataDictionaryImportError(f"In table {table_name!r}: {e}") from e
279
+
280
+ # Validate Foreign Keys
281
+ def _validate_foreign_keys(self) -> None:
282
+ name_to_table = {t.name: t for t in self.tables}
283
+ errors: list[str] = []
284
+ for table in self.tables:
285
+ for column in table:
286
+ if not column.foreign_key:
287
+ continue
288
+ target = column.foreign_key.strip()
289
+ if target.count(".") != 1:
290
+ errors.append(
291
+ f"{table.name}.{column.name} foreign key must be 'TABLE.COLUMN' (got {target!r})."
292
+ )
293
+ continue
294
+ target_table_raw, target_column_raw = target.split(".", 1)
295
+ target_table_name = _normalise_name(target_table_raw)
296
+ target_column_name = _normalise_name(target_column_raw)
297
+ referenced_table = name_to_table.get(target_table_name)
298
+ if not referenced_table:
299
+ errors.append(
300
+ f"{table.name}.{column.name} references unknown table {target_table_name!r}."
301
+ )
302
+ continue
303
+ try:
304
+ referenced_table.get_column(target_column_name)
305
+ except KeyError:
306
+ errors.append(
307
+ f"{table.name}.{column.name} references unknown column {target_table_name}.{target_column_name}."
308
+ )
309
+ if errors:
310
+ # Template issue → ImportError
311
+ raise DataDictionaryImportError(
312
+ "Foreign key validation errors:\n" + list_as_bullets(errors)
313
+ )
314
+
315
+ # Parse Columns Helpers
316
+ def _prepare_columns_sheet(self) -> tuple[pd.DataFrame, dict[str, str]]:
317
+ columns_df = self._read_sheet("columns", dtype=str, keep_default_na=False)
318
+ header_map = _norm_header_map(columns_df.columns)
319
+ # ensure required headers exist
320
+ for key in ("table", "column", "order", "data_type"):
321
+ _get_required_header(header_map, key)
322
+ return columns_df, header_map
323
+
324
+ def _columns_headers(
325
+ self, header_map: dict[str, str]
326
+ ) -> tuple[
327
+ str,
328
+ str,
329
+ str,
330
+ str,
331
+ str | None,
332
+ str | None,
333
+ str | None,
334
+ str | None,
335
+ str | None,
336
+ str | None,
337
+ ]:
338
+ table_col_header = _get_required_header(header_map, "table")
339
+ column_col_header = _get_required_header(header_map, "column")
340
+ order_col_header = _get_required_header(header_map, "order")
341
+ data_type_col_header = _get_required_header(header_map, "data_type")
342
+
343
+ length_col_header = header_map.get("length")
344
+ vocabulary_col_header = header_map.get("vocabularies")
345
+ enumeration_flag_col_header = header_map.get("enumerations")
346
+ primary_key_col_header = header_map.get("primary_key")
347
+ foreign_key_col_header = header_map.get("foreign_key_target")
348
+ description_col_header = header_map.get("description")
349
+ return (
350
+ table_col_header,
351
+ column_col_header,
352
+ order_col_header,
353
+ data_type_col_header,
354
+ length_col_header,
355
+ vocabulary_col_header,
356
+ enumeration_flag_col_header,
357
+ primary_key_col_header,
358
+ foreign_key_col_header,
359
+ description_col_header,
360
+ )
361
+
362
    def _extract_column_inputs(
        self,
        *,
        idx: int,
        row: pd.Series,
        table_col_header: str,
        column_col_header: str,
        order_col_header: str,
        data_type_col_header: str,
        length_col_header: str | None,
        vocabulary_col_header: str | None,
        enumeration_flag_col_header: str | None,
        primary_key_col_header: str | None,
        foreign_key_col_header: str | None,
        description_col_header: str | None,
    ) -> _ColumnInputs:
        """Validate one Columns-sheet row and convert it to ``_ColumnInputs``.

        Raises:
            DataDictionaryImportError: if required fields are missing, the
                table is unknown, or a cell cannot be parsed.
        """
        # +2 maps the frame index to the Excel row number: one for the header
        # row, one for 1-based numbering (assumes default RangeIndex — TODO confirm)
        row_context = f"(Columns row {idx + 2})"

        # Required presence
        missing_fields: list[str] = []
        if _is_missing(row[table_col_header]):
            missing_fields.append("Table")
        if _is_missing(row[column_col_header]):
            missing_fields.append("Column")
        if _is_missing(row[data_type_col_header]):
            missing_fields.append("Data Type")
        if _is_missing(row[order_col_header]):
            missing_fields.append("Order")
        if missing_fields:
            raise DataDictionaryImportError(
                f"{row_context}: missing required field(s): {', '.join(missing_fields)}."
            )

        table_name = _normalise_name(str(row[table_col_header]))
        column_name = _normalise_name(str(row[column_col_header]))
        # every column must belong to a table declared on the Tables sheet
        if table_name not in self.table_metadata:
            raise DataDictionaryImportError(
                f"{row_context}: Table '{table_name}' not present in Tables sheet."
            )

        # Numeric cells: Order is mandatory; Length / Primary Key optional
        order_int = _parse_int(row[order_col_header], "Order", row_context)
        length_int = (
            _parse_int(row[length_col_header], "Length", row_context, required=False)
            if length_col_header
            else None
        )
        primary_key_int = (
            _parse_int(
                row[primary_key_col_header], "Primary Key", row_context, required=False
            )
            if primary_key_col_header
            else None
        )

        # Free-text cells: None when the header is absent or the cell is blank
        vocabulary = (
            str(row[vocabulary_col_header]).strip()
            if (vocabulary_col_header and not _is_missing(row[vocabulary_col_header]))
            else None
        )
        foreign_key_target = (
            str(row[foreign_key_col_header]).strip()
            if (foreign_key_col_header and not _is_missing(row[foreign_key_col_header]))
            else None
        )
        description = (
            str(row[description_col_header]).strip()
            if (description_col_header and not _is_missing(row[description_col_header]))
            else None
        )

        # Data type: delegate parsing, re-raise as an import error with context
        try:
            data_type = DataType.parse(str(row[data_type_col_header]))
        except Exception as e:
            raise DataDictionaryImportError(
                f"{row_context}: invalid Data Type {row[data_type_col_header]!r}: {e}"
            ) from e

        # Enumerations flag: absent header or blank cell means False
        has_enumerations = (
            _parse_truthy(row[enumeration_flag_col_header])
            if (
                enumeration_flag_col_header
                and not _is_missing(row[enumeration_flag_col_header])
            )
            else False
        )

        return _ColumnInputs(
            table_name=table_name,
            column_name=column_name,
            order_int=order_int,
            data_type=data_type,
            length_int=length_int,
            vocabulary=vocabulary,
            primary_key_int=primary_key_int,
            foreign_key_target=foreign_key_target,
            description=description,
            has_enumerations=has_enumerations,
            row_context=row_context,
        )
461
+
462
+ def _make_column(self, inputs: _ColumnInputs) -> Column:
463
+ enums_for_column = self.enumerations.get(
464
+ (inputs.table_name, inputs.column_name), {}
465
+ )
466
+ return Column(
467
+ name=inputs.column_name,
468
+ order=inputs.order_int,
469
+ data_type=inputs.data_type,
470
+ length=inputs.length_int,
471
+ vocabulary=inputs.vocabulary,
472
+ primary_key=inputs.primary_key_int,
473
+ foreign_key=inputs.foreign_key_target,
474
+ description=inputs.description,
475
+ enumerations=enums_for_column,
476
+ )
477
+
478
+
479
+ # Public Entry
480
def import_dictionary(filepath: str | Path) -> Dictionary:
    """
    Summary:
        Import an Excel data dictionary into a Python Dictionary object.

    Args:
        filepath (str | Path): Path to the Excel data dictionary file.

    Returns:
        Dictionary: A Python Dictionary object created from the Excel data dictionary.

    Raises:
        DataDictionaryImportError: If there is an error importing the data dictionary.
    """
    importer = ExcelDataDictionary(filepath)
    return importer.to_dictionary()
@@ -0,0 +1,37 @@
1
# Normalised (lower-cased) sheet names every data-dictionary workbook must contain.
REQUIRED_SHEETS = {"details", "tables", "columns", "enumerations"}

# Canonical sheet keys.
S_DETAILS = "details"
S_TABLES = "tables"
S_COLUMNS = "columns"
S_ENUMERATIONS = "enumerations"

# Target table names the tabular sheets map onto.
T_TABLES = "tables"
T_COLUMNS = "columns"
T_ENUMERATIONS = "enumerations"

# Auxiliary "checks" column that appears on several sheets.
C_CHECKS = "checks"

# Sheet key -> table name; the Details sheet carries no tabular data.
DD_TABLE_MAP = {
    S_DETAILS: None,
    S_TABLES: T_TABLES,
    S_COLUMNS: T_COLUMNS,
    S_ENUMERATIONS: T_ENUMERATIONS,
}

# Sheet key -> expected column headers, in sheet order.
# NOTE(review): headers here use spaces ("data type", "primary key") while
# lookups elsewhere use underscore keys ("data_type") — presumably the header
# normalisation helper reconciles the two; confirm.
DD_COLUMN_MAP = {
    S_DETAILS: None,
    S_TABLES: ["table", "description", C_CHECKS],
    S_COLUMNS: [
        "table",
        "column",
        "order",
        "data type",
        "length",
        "vocabularies",
        "enumerations",
        "primary key",
        "column description",
        C_CHECKS,
    ],
    S_ENUMERATIONS: ["table", "column", "code", "description", C_CHECKS],
}