valediction 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- valediction/__init__.py +8 -0
- valediction/convenience.py +50 -0
- valediction/data_types/__init__.py +0 -0
- valediction/data_types/data_type_helpers.py +75 -0
- valediction/data_types/data_types.py +58 -0
- valediction/data_types/type_inference.py +541 -0
- valediction/datasets/__init__.py +0 -0
- valediction/datasets/datasets.py +870 -0
- valediction/datasets/datasets_helpers.py +46 -0
- valediction/demo/DEMO - Data Dictionary.xlsx +0 -0
- valediction/demo/DEMOGRAPHICS.csv +101 -0
- valediction/demo/DIAGNOSES.csv +650 -0
- valediction/demo/LAB_TESTS.csv +1001 -0
- valediction/demo/VITALS.csv +1001 -0
- valediction/demo/__init__.py +6 -0
- valediction/demo/demo_dictionary.py +129 -0
- valediction/dictionary/__init__.py +0 -0
- valediction/dictionary/exporting.py +501 -0
- valediction/dictionary/exporting_helpers.py +371 -0
- valediction/dictionary/generation.py +357 -0
- valediction/dictionary/helpers.py +174 -0
- valediction/dictionary/importing.py +494 -0
- valediction/dictionary/integrity.py +37 -0
- valediction/dictionary/model.py +582 -0
- valediction/dictionary/template/PROJECT - Data Dictionary.xltx +0 -0
- valediction/exceptions.py +22 -0
- valediction/integrity.py +97 -0
- valediction/io/__init__.py +0 -0
- valediction/io/csv_readers.py +307 -0
- valediction/progress.py +206 -0
- valediction/support.py +72 -0
- valediction/validation/__init__.py +0 -0
- valediction/validation/helpers.py +315 -0
- valediction/validation/issues.py +280 -0
- valediction/validation/validation.py +598 -0
- valediction-1.0.0.dist-info/METADATA +15 -0
- valediction-1.0.0.dist-info/RECORD +38 -0
- valediction-1.0.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,371 @@
|
|
|
1
|
+
from copy import copy
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Any, Dict, Iterable, List, Mapping, Tuple
|
|
4
|
+
|
|
5
|
+
from openpyxl.formatting.formatting import (
|
|
6
|
+
ConditionalFormatting,
|
|
7
|
+
ConditionalFormattingList,
|
|
8
|
+
)
|
|
9
|
+
from openpyxl.utils import get_column_letter, range_boundaries
|
|
10
|
+
from openpyxl.worksheet.cell_range import MultiCellRange
|
|
11
|
+
from openpyxl.worksheet.table import Table as ExcelTable
|
|
12
|
+
from openpyxl.worksheet.worksheet import Worksheet
|
|
13
|
+
|
|
14
|
+
from valediction.exceptions import DataDictionaryError, DataDictionaryImportError
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
|
|
18
|
+
class CFRuleInfo:
|
|
19
|
+
ranges: list[str]
|
|
20
|
+
type: str | None = None
|
|
21
|
+
formula: list[str] | None = None
|
|
22
|
+
operator: str | None = None
|
|
23
|
+
dxfId: int | None = None
|
|
24
|
+
priority: int | None = None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
|
|
28
|
+
class DVRuleInfo:
|
|
29
|
+
ranges: list[str]
|
|
30
|
+
type: str | None = None
|
|
31
|
+
operator: str | None = None
|
|
32
|
+
formula1: str | None = None
|
|
33
|
+
formula2: str | None = None
|
|
34
|
+
allowBlank: bool | None = None
|
|
35
|
+
showErrorMessage: bool | None = None
|
|
36
|
+
showInputMessage: bool | None = None
|
|
37
|
+
promptTitle: str | None = None
|
|
38
|
+
prompt: str | None = None
|
|
39
|
+
errorTitle: str | None = None
|
|
40
|
+
error: str | None = None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class CalculatedColInfo:
    """Description of a calculated (formula-driven) table column.

    All fields are required. NOTE(review): this class is not populated
    anywhere in the visible part of the module, so the field notes below are
    inferred from the names only -- confirm against the caller.
    """

    # Header text of the column.
    column_name: str
    # Position of the column (presumably 1-based within the table; verify).
    column_index: int
    # Coordinate of the header cell (presumably an A1 reference; verify).
    header_cell: str
    # One example of the formula found in the column.
    formula_sample: str
    # Presumably the share of rows carrying the formula; verify.
    coverage: float
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _collect_conditional_formats(worksheet: Worksheet) -> Iterable[CFRuleInfo]:
    """Yield a ``CFRuleInfo`` for every conditional-formatting rule on ``worksheet``.

    Rules are read from the private ``_cf_rules`` mapping when it is a dict;
    otherwise from the ``cf_rules(range)`` / ``ranges`` pair when the object
    offers both. Nothing is yielded when the worksheet has no
    ``conditional_formatting`` attribute or neither access path is available.

    Mapping keys are normalised with ``_key_ranges`` so that keys which are
    ``ConditionalFormatting`` / ``MultiCellRange`` objects produce plain A1
    range strings instead of the object's ``str()`` form.
    """
    cf = getattr(worksheet, "conditional_formatting", None)
    if cf is None:
        return

    rule_map = getattr(cf, "_cf_rules", None)
    if isinstance(rule_map, dict):
        # Preferred: internal mapping {key: [Rule, ...]}
        items = list(rule_map.items())
    elif hasattr(cf, "cf_rules") and hasattr(cf, "ranges"):
        # Fallback for objects exposing a per-range lookup instead
        items = [(rng, cf.cf_rules(rng)) for rng in cf.ranges]
    else:
        items = []

    for sqref, rules in items:
        ranges = _key_ranges(sqref)
        for rule in rules:
            yield CFRuleInfo(
                # give every record its own list so callers can mutate safely
                ranges=list(ranges),
                type=getattr(rule, "type", None),
                formula=getattr(rule, "formula", None),
                operator=getattr(rule, "operator", None),
                dxfId=getattr(rule, "dxfId", None),
                priority=getattr(rule, "priority", None),
            )
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _collect_data_validations(worksheet: Worksheet) -> Iterable[DVRuleInfo]:
    """Yield a ``DVRuleInfo`` for every data-validation rule on ``worksheet``.

    Nothing is yielded when the worksheet has no ``data_validations``
    attribute or when its ``dataValidation`` collection is missing or empty.
    """
    container = getattr(worksheet, "data_validations", None)
    if container is None:
        return

    # Attributes copied verbatim from each validation object onto the record.
    copied_fields = (
        "type",
        "operator",
        "formula1",
        "formula2",
        "allowBlank",
        "showErrorMessage",
        "showInputMessage",
        "promptTitle",
        "prompt",
        "errorTitle",
        "error",
    )

    for validation in getattr(container, "dataValidation", None) or ():
        targets: list[str] = []
        sqref = getattr(validation, "sqref", None) or getattr(validation, "ranges", None)
        if sqref is not None:
            try:
                # expected shape: an object whose .ranges iterates cell ranges
                targets = [str(cell_range) for cell_range in sqref.ranges]
            except Exception:
                # anything else is recorded through its string form
                targets = [str(sqref)]

        yield DVRuleInfo(
            ranges=targets,
            **{field: getattr(validation, field, None) for field in copied_fields},
        )
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _collect_table_formulas(table: ExcelTable) -> dict[str, str]:
    """Map lower-cased column names to their calculated-column formula text.

    Columns with a blank name or without a ``calculatedColumnFormula`` are
    skipped. The formula text is taken from ``attr_text``, then ``text``,
    then ``str(formula)``; a leading ``=`` is added when it is missing.
    """
    formulas_by_header: dict[str, str] = {}
    for col in getattr(table, "tableColumns", []):
        header = (getattr(col, "name", "") or "").strip()
        if not header:
            continue

        calc = getattr(col, "calculatedColumnFormula", None)
        if not calc:
            continue

        # first non-empty source wins
        raw = getattr(calc, "attr_text", None)
        if not raw:
            raw = getattr(calc, "text", None)
        if not raw:
            raw = str(calc)

        expression = str(raw)
        if raw and not expression.startswith("="):
            expression = f"={raw}"
        formulas_by_header[header.lower()] = expression
    return formulas_by_header
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _table_column_index_map(tbl: ExcelTable) -> dict[str, int]:
    """Map lower-cased column names to their 1-based position in the table.

    Columns with a blank name are left out of the map but still occupy their
    position. An empty dict is returned when the table has no columns.

    Raises:
        DataDictionaryImportError: two columns share a name, ignoring case.
    """
    positions: dict[str, int] = {}
    columns = getattr(tbl, "tableColumns", None)
    if not columns:
        return positions

    for position, column in enumerate(columns, start=1):
        label = (getattr(column, "name", "") or "").strip()
        if not label:
            continue
        folded = label.lower()
        if folded in positions:
            table_label = getattr(tbl, "displayName", getattr(tbl, "name", ""))
            raise DataDictionaryImportError(
                f"Duplicate column name (case-insensitive) in table '{table_label}': '{label}'"
            )
        positions[folded] = position
    return positions
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _cf_entries(cf: ConditionalFormattingList) -> list[tuple[object, list]]:
    """Return a snapshot ``[(key, rules), ...]`` of ``cf._cf_rules``.

    An empty list is returned when ``cf`` has no ``_cf_rules`` attribute or
    the attribute is not a dict.
    """
    rule_map = getattr(cf, "_cf_rules", None)
    return list(rule_map.items()) if isinstance(rule_map, dict) else []
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _key_ranges(key: ConditionalFormatting | MultiCellRange) -> list[str]:
    """Turn a conditional-formatting mapping key into a list of range strings.

    Accepted shapes, checked in this order: an object whose ``sqref`` has
    ``ranges``; an object with ``ranges``; a list or tuple; anything else is
    treated as a single range.
    """
    missing = object()
    sqref = getattr(key, "sqref", missing)
    if sqref is not missing and hasattr(sqref, "ranges"):
        members = sqref.ranges
    elif hasattr(key, "ranges"):
        members = key.ranges
    elif isinstance(key, (list, tuple)):
        members = key
    else:
        members = (key,)
    return list(map(str, members))
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _extend_ranges(
    range_strs: List[str], t_min_c: int, t_max_c: int, old_bottom: int, new_row: int
) -> Tuple[List[str], bool]:
    """Compute new sqref ranges; return (ranges, changed?).

    A range is stretched down to ``new_row`` when its last row equals
    ``old_bottom`` and its columns overlap ``t_min_c..t_max_c``. Ranges that
    cannot be parsed, that already contain ``new_row``, or that are
    open-ended (whole columns such as ``A:C`` or whole rows such as ``1:5``)
    are passed through untouched.
    """
    new_ranges: List[str] = []
    changed = False
    for rng in range_strs:
        try:
            c1, r1, c2, r2 = range_boundaries(rng)
        except ValueError:
            new_ranges.append(rng)
            continue
        # range_boundaries reports None for the missing side of whole-column
        # / whole-row ranges; comparing those with ints would raise TypeError.
        if None in (c1, r1, c2, r2):
            new_ranges.append(rng)
            continue
        if r1 <= new_row <= r2:
            new_ranges.append(rng)
            continue
        overlaps_cols = not (c2 < t_min_c or c1 > t_max_c)
        if r2 == old_bottom and overlaps_cols:
            r2 = new_row
            changed = True
        new_ranges.append(f"{get_column_letter(c1)}{r1}:{get_column_letter(c2)}{r2}")
    return new_ranges, changed
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def _extend_cf_for_new_row(
    ws, table_cols: tuple[int, int], old_bottom: int, new_row: int
) -> None:
    """Stretch conditional-formatting ranges that end at the table bottom.

    For every entry of ``ws.conditional_formatting`` whose ranges change
    under ``_extend_ranges``, the old mapping entry is dropped and each of
    its rules is re-added (as a shallow copy) under the extended ranges.
    Does nothing when the worksheet has no conditional formatting.

    Args:
        ws: worksheet whose conditional formatting is updated in place.
        table_cols: (first, last) column indexes of the table.
        old_bottom: last table row before the new row was appended.
        new_row: row that the ranges should now reach.
    """
    cf = getattr(ws, "conditional_formatting", None)
    if cf is None:
        return

    t_min_c, t_max_c = table_cols
    for key, rules in _cf_entries(cf):
        new_ranges, changed = _extend_ranges(
            _key_ranges(key), t_min_c, t_max_c, old_bottom, new_row
        )
        if not changed:
            continue
        # replace mapping (do NOT mutate key)
        cf._cf_rules.pop(key, None)
        # sqref strings are whitespace-separated; a comma-joined string would
        # be parsed as one (invalid) range when the key has several ranges
        sqref = " ".join(new_ranges)
        for rule in rules:
            cf.add(sqref, copy(rule))
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def _extend_dv_for_new_row(ws, table_cols: tuple[int, int], new_row: int) -> None:
    """Add the ``new_row`` cells to every data validation that overlaps the table.

    For each validation range that does not already contain ``new_row`` and
    whose columns intersect ``table_cols``, a one-row segment covering the
    intersecting columns is added to that validation. Does nothing when the
    worksheet has no data validations.
    """
    first_col, last_col = table_cols

    validations = getattr(ws, "data_validations", None)
    rules = getattr(validations, "dataValidation", None) if validations else None
    if not rules:
        return

    for rule in rules:
        targets = getattr(rule, "sqref", None) or getattr(rule, "ranges", None)
        if not targets:
            continue
        try:
            cell_ranges = list(targets.ranges)
        except Exception:
            cell_ranges = []

        # collect first, add afterwards, so the ranges are not modified
        # while they are being examined
        pending: list[str] = []
        for cell_range in cell_ranges:
            left, top, right, bottom = range_boundaries(str(cell_range))
            if top <= new_row <= bottom:
                # this segment already covers the new row
                continue
            clip_left = max(left, first_col)
            clip_right = min(right, last_col)
            if clip_left <= clip_right:
                pending.append(
                    f"{get_column_letter(clip_left)}{new_row}:{get_column_letter(clip_right)}{new_row}"
                )

        for segment in pending:
            rule.add(segment)
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _first_blank_data_row(
    ws: Worksheet, tbl: ExcelTable, colmap: dict[str, int], formulas: dict[str, str]
) -> int | None:
    """Return the first table data row that is blank in all non-formula columns.

    The first row of ``tbl.ref`` is treated as the header; the rows below it
    are scanned top to bottom. A cell counts as blank when its value is
    ``None`` or a whitespace-only string. The result is the worksheet row
    number (the same value that is passed to ``ws.cell``), or ``None`` when
    the table has no data rows or none of them is blank.
    """
    first_col, header_row, last_col, last_row = range_boundaries(tbl.ref)
    first_data_row = header_row + 1
    if first_data_row > last_row:
        return None

    # table position (1..n) -> lower-cased header
    header_at = {position: header for header, position in colmap.items()}
    width = last_col - first_col + 1

    def is_empty(value: object) -> bool:
        return value is None or (isinstance(value, str) and value.strip() == "")

    for row in range(first_data_row, last_row + 1):
        for position in range(1, width + 1):
            if header_at.get(position, "") in formulas:
                # formula columns never decide whether a row is blank
                continue
            if not is_empty(ws.cell(row, first_col + position - 1).value):
                break
        else:
            # no non-formula cell held a value
            return row
    return None
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def _build_row_list_from_mapping(
    tbl: ExcelTable,
    colmap: Dict[str, int],
    formulas: Dict[str, str],
    mapping: Mapping[str, Any],
) -> List[Any]:
    """Expand a ``{header: value}`` mapping into a list as wide as the table.

    Mapping keys are matched to table headers after stripping and
    lower-casing. Positions without a known header, positions of formula
    columns, and headers absent from ``mapping`` are left as ``None``.
    """
    first_col, _, last_col, _ = range_boundaries(tbl.ref)
    width = last_col - first_col + 1

    # normalise the incoming keys once
    supplied = {str(key).strip().lower(): value for key, value in mapping.items()}
    # table position (1..n) -> lower-cased header
    header_at = {position: header for header, position in colmap.items()}

    def value_for(position: int) -> Any:
        header = header_at.get(position, "")
        if header and header not in formulas and header in supplied:
            return supplied[header]
        return None

    return [value_for(position) for position in range(1, width + 1)]
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def _norm_label(s: object) -> str:
    """Return ``s`` as a comparison key for label matching.

    ``None`` becomes the empty string. Otherwise the text form of ``s`` is
    trimmed, loses one trailing colon, has every run of whitespace
    (including line breaks) collapsed to a single space, and is case-folded.
    """
    if s is None:
        return ""
    # strip()/split() already treat "\n" and "\r" as whitespace
    text = str(s).strip().removesuffix(":")
    return " ".join(text.split()).casefold()
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def _find_label_cell(ws: Worksheet, label: str) -> tuple[int, int]:
    """Locate the first cell whose normalised text equals the normalised ``label``.

    Cells are compared through ``_norm_label`` and scanned row by row, left
    to right, between the worksheet's min/max row and column.

    Returns:
        ``(row, column)`` of the first matching cell.

    Raises:
        DataDictionaryError: no cell in the scanned area matches.
    """
    wanted = _norm_label(label)
    top, left = ws.min_row, ws.min_column
    bottom, right = ws.max_row, ws.max_column

    # lazy scan: stops at the first hit
    matches = (
        (row, col)
        for row in range(top, bottom + 1)
        for col in range(left, right + 1)
        if _norm_label(ws.cell(row, col).value) == wanted
    )
    found = next(matches, None)
    if found is None:
        raise DataDictionaryError(f"Details label not found: {label!r}")
    return found
|