valediction-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. valediction/__init__.py +8 -0
  2. valediction/convenience.py +50 -0
  3. valediction/data_types/__init__.py +0 -0
  4. valediction/data_types/data_type_helpers.py +75 -0
  5. valediction/data_types/data_types.py +58 -0
  6. valediction/data_types/type_inference.py +541 -0
  7. valediction/datasets/__init__.py +0 -0
  8. valediction/datasets/datasets.py +870 -0
  9. valediction/datasets/datasets_helpers.py +46 -0
  10. valediction/demo/DEMO - Data Dictionary.xlsx +0 -0
  11. valediction/demo/DEMOGRAPHICS.csv +101 -0
  12. valediction/demo/DIAGNOSES.csv +650 -0
  13. valediction/demo/LAB_TESTS.csv +1001 -0
  14. valediction/demo/VITALS.csv +1001 -0
  15. valediction/demo/__init__.py +6 -0
  16. valediction/demo/demo_dictionary.py +129 -0
  17. valediction/dictionary/__init__.py +0 -0
  18. valediction/dictionary/exporting.py +501 -0
  19. valediction/dictionary/exporting_helpers.py +371 -0
  20. valediction/dictionary/generation.py +357 -0
  21. valediction/dictionary/helpers.py +174 -0
  22. valediction/dictionary/importing.py +494 -0
  23. valediction/dictionary/integrity.py +37 -0
  24. valediction/dictionary/model.py +582 -0
  25. valediction/dictionary/template/PROJECT - Data Dictionary.xltx +0 -0
  26. valediction/exceptions.py +22 -0
  27. valediction/integrity.py +97 -0
  28. valediction/io/__init__.py +0 -0
  29. valediction/io/csv_readers.py +307 -0
  30. valediction/progress.py +206 -0
  31. valediction/support.py +72 -0
  32. valediction/validation/__init__.py +0 -0
  33. valediction/validation/helpers.py +315 -0
  34. valediction/validation/issues.py +280 -0
  35. valediction/validation/validation.py +598 -0
  36. valediction-1.0.0.dist-info/METADATA +15 -0
  37. valediction-1.0.0.dist-info/RECORD +38 -0
  38. valediction-1.0.0.dist-info/WHEEL +4 -0
valediction/validation/validation.py
@@ -0,0 +1,598 @@
+ from __future__ import annotations
+
+ from datetime import timedelta
+ from pathlib import Path
+ from typing import Iterator
+
+ import numpy as np
+ from pandas import DataFrame, Series
+
+ from valediction.data_types.data_type_helpers import (
+     infer_datetime_format,
+ )
+ from valediction.data_types.data_types import DataType
+ from valediction.datasets.datasets_helpers import DataLike, DatasetItemLike
+ from valediction.dictionary.model import Table
+ from valediction.exceptions import DataDictionaryImportError, DataIntegrityError
+ from valediction.io.csv_readers import (
+     CsvReadConfig,
+     FrameChunk,
+     iter_csv_chunks,
+ )
+ from valediction.progress import Progress
+ from valediction.support import _get_runtime_string, calculate_runtime
+ from valediction.validation.helpers import (
+     _column_has_values,
+     _set_nulls,
+     create_pk_hashes,
+     invalid_mask_date,
+     invalid_mask_datetime,
+     invalid_mask_float,
+     invalid_mask_integer,
+     invalid_mask_text_forbidden_characters,
+     invalid_mask_text_too_long,
+     mask_to_ranges,
+     pk_contains_whitespace_mask,
+ )
+ from valediction.validation.issues import Issues, IssueType, Range
+
+ IMPORTING_DATA = "Importing data"
+ SINGLE_STEPS: int = 3  # tweak if adding/amending step tracking
+ CHUNK_STEPS: int = 12  # tweak if adding/amending step tracking
+
+
+ class Validator:
+     """
+     Summary:
+         Validates a dataset against a dictionary.
+
+     Arguments:
+         dataset_item (DatasetItemLike): dataset item to validate
+         table_dictionary (Table): table dictionary to validate against
+         feedback (bool): whether to provide feedback on validation (default: True)
+         chunk_size (int): size of chunks to validate (default: 10_000_000)
+
+     Raises:
+         DataDictionaryImportError: if the provided dictionary is invalid
+         DataIntegrityError: if the validated dataset contains invalid values
+     """
+
+     def __init__(
+         self,
+         dataset_item: DatasetItemLike,
+         table_dictionary: Table,
+         feedback: bool = True,
+         chunk_size: int = 10_000_000,
+         _padding: int = 0,
+     ):
+         # User Variables
+         self.dataset_item = dataset_item
+         self.data: DataLike = dataset_item.data
+         self.table_dictionary: Table = table_dictionary
+         self.chunk_size: int = chunk_size
+         self.feedback: bool = feedback
+         self._padding: int = _padding
+
+         # Config
+         self.table_name: str | None = None
+         self.issues: Issues | None = None
+         self.csv_cfg: CsvReadConfig | None = None
+
+         # Validation Tracking
+         self.tracker_seen_non_nulls: dict[str, bool] = {}
+         self.tracker_pk_hashes: dict[int, int] = {}
+         self.tracker_pk_reported_first: set[int] = set()
+         self._dt_format_cache: dict[str, str | None] = {}
+         self._dt_needs_infer: set[str] = set()
+
+         # Helpers
+         self._column_names: set = set(self.table_dictionary.get_column_names())
+
+         # Progress Tracking
+         self.progress: Progress | None = None
+         self.est_chunk_count: int | None = None
+         self._runtimes: dict[str, timedelta] | None = None
+
+         # Setup
+         self.__check_dictionary()
+         self.__init_issues()
+         self.__init_csv_cfg()
+         self.__reset_pk_trackers()
+         self.__import_datetime_format_cache()
+
+     # Properties
+     @property
+     def data_is_path(self) -> bool:
+         return isinstance(self.data, (Path, str))
+
+     @property
+     def data_is_dataframe(self) -> bool:
+         return isinstance(self.data, DataFrame)
+
+     @property
+     def runtimes(self) -> dict[str, str]:
+         return {
+             step: _get_runtime_string(time_delta)
+             for step, time_delta in self._runtimes.items()
+         }
+
+     # Initialisation
+     def __check_dictionary(self):
+         if self.table_dictionary is None or not isinstance(
+             self.table_dictionary, Table
+         ):
+             raise DataDictionaryImportError(
+                 "Data Dictionary not yet imported or generated. "
+                 "Validation must first have a Data Dictionary. "
+                 "Please first run DataSet.import_dictionary(), including `primary_keys`."
+             )
+
+         self.table_dictionary.check()
+         self.table_name = self.table_dictionary.name
+
+     def __init_issues(self):
+         self.issues = Issues()
+
+     def __init_csv_cfg(self):
+         self.csv_cfg = CsvReadConfig()
+
+     def __reset_pk_trackers(self):
+         self.tracker_pk_hashes: dict[int, int] = {}
+         self.tracker_pk_reported_first: set[int] = set()
+
+     def __import_datetime_format_cache(self) -> None:
+         self._dt_format_cache.clear()
+         self._dt_needs_infer.clear()
+
+         for column in self.table_dictionary:
+             name = column.name
+             datetime_format = column.datetime_format
+             data_type = column.data_type
+
+             if data_type in (DataType.DATE, DataType.DATETIME):
+                 self._dt_format_cache[name] = datetime_format
+
+                 if not datetime_format:
+                     self._dt_needs_infer.add(name)
+
+     # Validate
+     def validate(self):
+         """
+         Summary:
+             Validate the dataset against the data dictionary
+
+         Raises:
+             DataDictionaryImportError: if the data dictionary has not yet been imported or generated
+             DataDictionaryError: if the data dictionary contains invalid data
+             DataIntegrityError: if the dataset contains invalid values
+         """
+         self.__progress_init()
+         first_chunk = True
+
+         for chunk in self.__iterate_data_chunks(
+             self.data, self.chunk_size, self.csv_cfg
+         ):
+             df = chunk.df
+             start = chunk.start
+
+             # First Chunk Only
+             if first_chunk:
+                 self._check_for_missing_columns(df)
+                 self._check_for_extra_columns(df)
+                 first_chunk = False
+
+             # Remove Nulls
+             df = self._set_nulls(df)
+
+             # Structural Checks
+             self._check_for_column_nulls(df)
+             self._check_primary_key_whitespace(df, start_row=start)
+             self._check_primary_key_integrity(df, start_row=start)
+
+             # Data Type Checks
+             self._infer_datetime_formats(df)
+             self._check_column_types(df, start_row=start)
+             self._check_text_lengths(df, start_row=start)
+             self._check_text_forbidden_chars(df, start_row=start)
+
+         # Final Checks
+         self._check_for_fully_null_column()
+
+         # Finish
+         self.__reset_pk_trackers()
+         self.__finish_validation()
+         if self.issues:
+             raise DataIntegrityError(self.issues)
+
+     # Global Helpers
+     def __iterate_data_chunks(
+         self,
+         data: Path | str | DataFrame,
+         chunk_size: int | None,
+         csv_config: CsvReadConfig | None = None,
+     ) -> Iterator[FrameChunk]:
+         """Yield FrameChunk and keep a running estimate of total chunks for
+         the progress bar."""
+         csv_config = csv_config or CsvReadConfig()
+
+         # In-memory DataFrame: single chunk
+         if isinstance(data, DataFrame):
+             self.__begin_step(step=IMPORTING_DATA)
+             n = len(data)
+             if n == 0:
+                 self.__complete_step()
+                 return
+
+             # One chunk only
+             self.__progress_retarget_total(est_chunk_count=1)
+             self.__complete_step()
+             yield FrameChunk(
+                 df=data,
+                 start=0,
+                 end=n - 1,
+                 total_size=None,
+                 file_pos=None,
+                 bytes_read=None,
+                 chunk_index=1,
+                 total_bytes_read=None,
+                 total_chunks_seen=None,
+             )
+             return
+
+         # Path/str: iterate the CSV in chunks
+         iterator = iter_csv_chunks(
+             path=Path(data), chunk_size=chunk_size, cfg=csv_config
+         )
+         while True:
+             self.__begin_step(step=IMPORTING_DATA)
+             try:
+                 chunk = next(iterator)
+             except StopIteration:
+                 break
+
+             est_chunk_count = chunk.estimate_chunk_count()
+             self.__progress_retarget_total(est_chunk_count=est_chunk_count)
+
+             # Bookkeeping & yield
+             self.__complete_step()
+             yield chunk
+
+     # Finder Helpers
+     def _find_data_type(self, column_name: str) -> DataType:
+         return self.table_dictionary.get_column(column_name).data_type
+
+     def _find_datetime_format(self, column_name: str) -> str | None:
+         return self.table_dictionary.get_column(column_name).datetime_format
+
+     def _find_max_length(self, column_name: str) -> int | None:
+         return self.table_dictionary.get_column(column_name).length
+
+     # Validation: Start Helpers
+     def _check_for_missing_columns(self, df: DataFrame):
+         self.__begin_step(step="Checking for missing columns")
+         missing = self._column_names - set(df.columns)
+         if missing:
+             for column in missing:
+                 self.issues.add(
+                     issue_type=IssueType.MISSING_COLUMN,
+                     table=self.table_name,
+                     column=column,
+                     parent=self.dataset_item,
+                 )
+         self.__complete_step()
+
+     def _check_for_extra_columns(self, df: DataFrame):
+         self.__begin_step(step="Checking for extra columns")
+         extra = set(df.columns) - self._column_names
+         if extra:
+             for column in extra:
+                 self.issues.add(
+                     issue_type=IssueType.EXTRA_COLUMN,
+                     table=self.table_name,
+                     column=column,
+                     parent=self.dataset_item,
+                 )
+         self.__complete_step()
+
+     # Validation: Chunk Helpers
+     def _set_nulls(self, df: DataFrame) -> DataFrame:
+         self.__begin_step(step="Setting nulls")
+         df = _set_nulls(df)
+         self.__complete_step()
+         return df
+
+     def _check_for_column_nulls(self, df: DataFrame) -> None:
+         self.__begin_step(step="Checking for column nulls")
+         for column in df.columns:
+             # Skip columns already known to contain non-null values
+             seen_or_found = self.tracker_seen_non_nulls.get(column, False)
+             if not seen_or_found:
+                 self.tracker_seen_non_nulls[column] = _column_has_values(df[column])
+         self.__complete_step()
+
+     def _check_primary_key_whitespace(self, df: DataFrame, start_row: int) -> None:
+         pk_cols = self.table_dictionary.get_primary_keys()
+         if not pk_cols:
+             return
+
+         # Check for whitespace (text columns only)
+         self.__begin_step(step="Checking for primary key whitespace")
+         pk_cols_text = []
+         for column in self.table_dictionary:
+             if column.name in pk_cols and column.data_type == DataType.TEXT:
+                 pk_cols_text.append(column.name)
+
+         if pk_cols_text:
+             space_mask = pk_contains_whitespace_mask(df[pk_cols_text])
+             if space_mask.any():
+                 self.issues.add(
+                     issue_type=IssueType.PK_WHITESPACE,
+                     table=self.table_name,
+                     column=None,
+                     ranges=mask_to_ranges(space_mask, start_row),
+                     parent=self.dataset_item,
+                 )
+         self.__complete_step()
+
+     def _check_primary_key_integrity(self, df: DataFrame, start_row: int) -> None:
+         pk_cols = self.table_dictionary.get_primary_keys()
+         if not pk_cols:
+             return
+
+         # Create primary key hashes
+         self.__begin_step(step="Creating primary key hashes")
+         pk_hashes = create_pk_hashes(df[pk_cols])
+         self.__complete_step()
+
+         # 1) Primary key nulls
+         self.__begin_step(step="Checking for primary key nulls")
+         null = pk_hashes.isna()
+         non_null = ~null
+         pk_hashes_non_null = pk_hashes[non_null]
+
+         if null.any():
+             self.issues.add(
+                 IssueType.PK_NULL,
+                 table=self.table_name,
+                 column=None,
+                 ranges=mask_to_ranges(null, start_row),
+                 parent=self.dataset_item,
+             )
+         self.__complete_step()
+
+         # 2) In-chunk collisions
+         self.__begin_step(step="Checking for primary key collision")
+         codes, uniques = pk_hashes_non_null.factorize(sort=False)
+         counts = np.bincount(codes, minlength=len(uniques))
+         in_chunk_local = counts[codes] > 1
+         in_chunk_collision = non_null.copy()
+         in_chunk_collision.loc[non_null] = in_chunk_local
+
+         # 3) Cross-chunk collisions (hashes already seen in earlier chunks)
+         seen_before = set(self.tracker_pk_hashes)
+         unique_in_seen = np.fromiter(
+             (unique in seen_before for unique in uniques),
+             dtype=bool,
+             count=len(uniques),
+         )
+         seen_before_local = unique_in_seen[codes]
+         cross_chunk_local = seen_before_local & ~in_chunk_local
+         cross_chunk_collision = non_null.copy()
+         cross_chunk_collision.loc[non_null] = cross_chunk_local
+
+         # 4) First appearances (valid in-chunk PKs)
+         first_in_chunk_local = ~pk_hashes_non_null.duplicated(keep="first")
+         first_appearance_local = first_in_chunk_local & ~seen_before_local
+
+         # 5) Emit in-chunk collision Issues
+         if in_chunk_collision.any():
+             self.issues.add(
+                 IssueType.PK_COLLISION,
+                 table=self.table_name,
+                 column=None,
+                 ranges=mask_to_ranges(in_chunk_collision, start_row),
+                 parent=self.dataset_item,
+             )
+
+         # 6) Emit cross-chunk collision Issues
+         if cross_chunk_collision.any():
+             self.issues.add(
+                 IssueType.PK_COLLISION,
+                 table=self.table_name,
+                 column=None,
+                 ranges=mask_to_ranges(cross_chunk_collision, start_row),
+                 parent=self.dataset_item,
+             )
+
+         # Also report the row where each colliding PK first appeared
+         for h in pk_hashes_non_null[cross_chunk_local].unique():
+             if h not in self.tracker_pk_reported_first:
+                 first_row = self.tracker_pk_hashes[int(h)]
+                 self.issues.add(
+                     IssueType.PK_COLLISION,
+                     table=self.table_name,
+                     column=None,
+                     ranges=[Range(first_row, first_row)],
+                     parent=self.dataset_item,
+                 )
+                 self.tracker_pk_reported_first.add(int(h))
+         self.__complete_step()
+
+         # 7) Record valid PKs (first occurrences) for later chunks
+         self.__begin_step(step="Caching primary keys")
+         if first_appearance_local.any():
+             pos = np.flatnonzero(first_appearance_local.to_numpy())
+             vals = pk_hashes_non_null.to_numpy()[pos]
+             start_rows = start_row + pos
+             for h, r in zip(vals, start_rows, strict=False):
+                 self.tracker_pk_hashes.setdefault(int(h), int(r))
+         self.__complete_step()
+
+     def _infer_datetime_formats(self, df: DataFrame) -> None:
+         self.__begin_step(step="Inferring datetime formats")
+         if not self._dt_needs_infer:
+             self.__complete_step()
+             return
+
+         columns = [col for col in self._dt_needs_infer if col in df.columns]
+         if not columns:
+             self.__complete_step()
+             return
+
+         for column in columns:
+             series = df[column].astype("string", copy=False).str.strip()
+             unique = series.dropna().unique()
+             if len(unique) == 0:
+                 continue
+
+             try:
+                 fmt_or_false = infer_datetime_format(Series(unique, dtype="string"))
+             except ValueError:
+                 # Ambiguous - try again on a later chunk
+                 continue
+
+             if fmt_or_false:
+                 col_dtype = self._find_data_type(column)
+                 from valediction.validation.helpers import _allowed_formats_for
+
+                 allowed = _allowed_formats_for(col_dtype)
+                 if fmt_or_false in allowed:
+                     self._dt_format_cache[column] = fmt_or_false
+                     self._dt_needs_infer.discard(column)
+
+                     # Persist in the dictionary (best effort)
+                     try:
+                         self.table_dictionary.get_column(
+                             column
+                         ).datetime_format = fmt_or_false
+                     except Exception:
+                         pass
+         self.__complete_step()
+
+     def _check_column_types(self, df: DataFrame, start_row: int) -> None:
+         self.__begin_step(step="Checking column types")
+         present = [col for col in df.columns if col in self._column_names]
+         for col in present:
+             dtype = self._find_data_type(col)
+             if dtype == DataType.TEXT:
+                 continue
+
+             series = df[col]
+             if dtype == DataType.INTEGER:
+                 invalid = invalid_mask_integer(series)
+             elif dtype == DataType.FLOAT:
+                 invalid = invalid_mask_float(series)
+             elif dtype == DataType.DATE:
+                 fmt = self._dt_format_cache.get(col) or self._find_datetime_format(col)
+                 invalid = invalid_mask_date(series, fmt)
+             elif dtype == DataType.DATETIME:
+                 fmt = self._dt_format_cache.get(col) or self._find_datetime_format(col)
+                 invalid = invalid_mask_datetime(series, fmt)
+             else:
+                 continue
+
+             if invalid.any():
+                 self.issues.add(
+                     IssueType.TYPE_MISMATCH,
+                     table=self.table_name,
+                     column=col,
+                     ranges=mask_to_ranges(invalid, start_row),
+                     parent=self.dataset_item,
+                 )
+         self.__complete_step()
+
+     def _check_text_lengths(self, df: DataFrame, start_row: int) -> None:
+         self.__begin_step(step="Checking text lengths")
+         present = [col for col in df.columns if col in self._column_names]
+         for col in present:
+             if self._find_data_type(col) != DataType.TEXT:
+                 continue
+             max_len = self._find_max_length(col)
+             invalid = invalid_mask_text_too_long(df[col], max_len)
+             if invalid.any():
+                 self.issues.add(
+                     IssueType.TEXT_TOO_LONG,
+                     table=self.table_name,
+                     column=col,
+                     ranges=mask_to_ranges(invalid, start_row),
+                     parent=self.dataset_item,
+                 )
+         self.__complete_step()
+
+     def _check_text_forbidden_chars(self, df: DataFrame, start_row: int) -> None:
+         self.__begin_step(step="Checking for forbidden characters")
+         present = [col for col in df.columns if col in self._column_names]
+         for col in present:
+             if self._find_data_type(col) != DataType.TEXT:
+                 continue
+             mask = invalid_mask_text_forbidden_characters(df[col])
+             if mask.any():
+                 self.issues.add(
+                     IssueType.FORBIDDEN_CHARACTER,
+                     table=self.table_name,
+                     column=col,
+                     ranges=mask_to_ranges(mask, start_row),
+                     parent=self.dataset_item,
+                 )
+         self.__complete_step()
+
+     # Validation: Final Helpers
+     def _check_for_fully_null_column(self):
+         self.__begin_step(step="Checking for fully null columns")
+         for column, seen in self.tracker_seen_non_nulls.items():
+             if not seen:
+                 self.issues.add(
+                     issue_type=IssueType.FULLY_NULL_COLUMN,
+                     table=self.table_name,
+                     column=column,
+                     parent=self.dataset_item,
+                 )
+         self.__complete_step()
+
+     # Progress Helpers
+     def __progress_init(self) -> None:
+         if not self.feedback:
+             self.progress = Progress(enabled=False)
+             return
+
+         total_steps = (
+             (SINGLE_STEPS + CHUNK_STEPS)
+             if (isinstance(self.data, DataFrame) or not self.chunk_size)
+             else None
+         )
+         self.est_chunk_count = None
+         pad = " " * self._padding if self._padding else ""
+
+         self.progress = Progress(
+             desc=f"Validating {self.table_name}: {pad}",
+             starting_step=IMPORTING_DATA,
+             est_total=total_steps,
+             smoothing_steps=CHUNK_STEPS,
+         )
+
+     def __progress_retarget_total(self, est_chunk_count: int) -> None:
+         """Once est_chunk_count is known, resize the bar without losing progress."""
+         if est_chunk_count != self.est_chunk_count:
+             self.est_chunk_count = est_chunk_count
+             new_total = SINGLE_STEPS + (CHUNK_STEPS * self.est_chunk_count)
+             self.progress.retarget_total(new_total=new_total)
+
+     def __finish_validation(self) -> None:
+         completed = "Completed with issues" if self.issues else "Completed"
+         step = (
+             f"{completed} ({calculate_runtime(start=self.progress.full_start).message})"
+         )
+         save_as = "Total"
+         good = not self.issues
+         self.progress.finish(postfix=step, save_as=save_as, good=good)
+         self._runtimes = self.progress.runtimes
+         self.progress.close()
+
+     def __begin_step(self, step: str | None = None) -> None:
+         self.progress.begin_step(step=step)
+
+     def __complete_step(self) -> None:
+         self.progress.complete_step()
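
Since only this validation module is rendered in full, a usage sketch may help orient readers. This is illustrative only: the dataset item below is a hypothetical stand-in (the real DataSet and dictionary-import APIs live in the other files listed above), and a ready-made Table instance is assumed.

# Hypothetical usage sketch for the Validator above. `_Item` is a minimal
# DatasetItemLike stand-in; in the real package these objects come from the
# DataSet / dictionary-import APIs defined elsewhere in this wheel.
from dataclasses import dataclass

from valediction.exceptions import DataIntegrityError
from valediction.validation.validation import Validator


@dataclass
class _Item:
    data: str  # path to a CSV (a pandas DataFrame also works)


def validate_table(csv_path: str, table_dictionary) -> None:
    """table_dictionary: a valediction.dictionary.model.Table instance."""
    validator = Validator(
        dataset_item=_Item(data=csv_path),
        table_dictionary=table_dictionary,
        feedback=True,          # show the progress bar
        chunk_size=10_000_000,  # default from the signature above
    )
    try:
        validator.validate()
    except DataIntegrityError as exc:
        print(exc)  # the Issues collection gathered during validation
        raise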
valediction-1.0.0.dist-info/METADATA
@@ -0,0 +1,15 @@
+ Metadata-Version: 2.4
+ Name: valediction
+ Version: 1.0.0
+ Summary: Valediction is a convenience data validation package that allows generation, import, and constraint enforcement of user-defined data dictionaries against datasets.
+ Author-email: Cai Davis <Cai.Davis@uhs.nhs.uk>
+ Requires-Python: <4.0,>=3.11
+ Requires-Dist: certifi<2025,>=2024.2.2
+ Requires-Dist: mohawk<2,>=1.1.0
+ Requires-Dist: openpyxl<4,>=3.1.5
+ Requires-Dist: pandas<3,>=2.2.1
+ Requires-Dist: pydantic<3,>=2.11.4
+ Requires-Dist: requests<3,>=2.31.0
+ Requires-Dist: tabulate<0.10,>=0.9.0
+ Requires-Dist: tqdm>=4.67.1
+ Requires-Dist: xlsxwriter<4,>=3.2.3
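
The pins above can be read back at runtime with the standard library; a minimal sketch, assuming valediction 1.0.0 is installed in the current environment:

# Read the pinned requirements back via importlib.metadata.
from importlib.metadata import metadata, requires

meta = metadata("valediction")
print(meta["Requires-Python"])      # <4.0,>=3.11
for requirement in requires("valediction") or []:
    print(requirement)              # e.g. pandas<3,>=2.2.1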
valediction-1.0.0.dist-info/RECORD
@@ -0,0 +1,38 @@
+ valediction/__init__.py,sha256=Bv2Nd-6bolTfToLT2s4NvRsj42ZH2Ls4rLApet5ai1E,519
+ valediction/convenience.py,sha256=_N7LFaCRQC8xENpu12vxyPIrpFpR0XA-m1wUa1m7dTo,1670
+ valediction/exceptions.py,sha256=YVgf4nOS26lnsImXmROvLhZW-YS8srrfMC7ugsQQKkI,770
+ valediction/integrity.py,sha256=8TPJ2mMmb4Qm-XwPj4Fwoivt7dRJdYlWtiBw8kdjXYA,3449
+ valediction/progress.py,sha256=ZSigmzRiLwxhqvgUdhW70I1WANEmtCpocwkmeystZDw,6180
+ valediction/support.py,sha256=syAxhRpFSzSk2ZmzdO43EI-SK3QNb_45p_oDp_5TMHk,2228
+ valediction/data_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ valediction/data_types/data_type_helpers.py,sha256=_VqJLCfOszjftv5OYabMLaWqGXwk6qdXoIOcqVKIYq8,2357
+ valediction/data_types/data_types.py,sha256=WiSdBALYeU8d_SeYJ4cxPLQWAE07L8kYwf-JwBX4yKg,1663
+ valediction/data_types/type_inference.py,sha256=QIdFtR5niG90jUW0UYi0ymffDgT7KMAxaHmZpkI1jlY,20493
+ valediction/datasets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ valediction/datasets/datasets.py,sha256=-LvecqxkudnV83SBxd2HAgCJzRe5DXJQGoTCN4KvzNk,30890
+ valediction/datasets/datasets_helpers.py,sha256=uZUhXA8G0FA4YrItk8JAJzZp0j84bfWBOBaWJHMroRY,1205
+ valediction/demo/DEMO - Data Dictionary.xlsx,sha256=wj1JG8dHgdALVwV0zSSYnyWMomMTzrHxGFRm491wM_A,45308
+ valediction/demo/DEMOGRAPHICS.csv,sha256=VY3JwQeNotDxFi47MPVp2s0MxfvP9c8g7Wgh2Lmx8dY,3538
+ valediction/demo/DIAGNOSES.csv,sha256=21Eb4RQCiCuLNYMqYx_Kq1ov61yDq94WJGzyGnJpMMs,20434
+ valediction/demo/LAB_TESTS.csv,sha256=cVuq64DAhdulIy7L3eGXc3PMllkGR4cedhcQgOGgC08,81402
+ valediction/demo/VITALS.csv,sha256=q32tIvEB3a6I1vZPKzDZBbYNTld_SUZddX4vse0oX24,47644
+ valediction/demo/__init__.py,sha256=y5gM8K1zmFyZgEIQSw4mTGd4F7CsX99Ekio8kp9lBmw,206
+ valediction/demo/demo_dictionary.py,sha256=XK-6OVp0vBkWz6TSfpbE_3jZ9cGB1Dgxli2EpsHr5Iw,4539
+ valediction/dictionary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ valediction/dictionary/exporting.py,sha256=Kg0boxNBYV4w0uEH7TXN2YumQVIZKMStY158QI_6yFs,19118
+ valediction/dictionary/exporting_helpers.py,sha256=FP-4Hn90FwvdrjJjKUvvPDC-pIJMmZQANyPQ4eZLN4k,12805
+ valediction/dictionary/generation.py,sha256=SMD_h02WAzHDTF-J5HGLHOYPNVzi3Km3jEm_6jSjg14,12707
+ valediction/dictionary/helpers.py,sha256=mN_y1tK8Zo6gKqWGqrKMO1zGTjxG16VP2QMeheucxO4,5634
+ valediction/dictionary/importing.py,sha256=boXNyMMqHPj1HUzlu7otVh45dmBqbmsm35Y9NyH3ZvU,19335
+ valediction/dictionary/integrity.py,sha256=8vgaRYBm5zmOPwQijqID4_Emt-IfL1eTWzFoDI6MExk,824
+ valediction/dictionary/model.py,sha256=fFsIh-L7vB1qcVB3aoB76GJBOsmoDOaAhNSRz3ari_I,19689
+ valediction/dictionary/template/PROJECT - Data Dictionary.xltx,sha256=ZsWmJsSBHvh3ADfrntmeVMWI9Vp_q7zqrTgp7rGd-AI,41721
+ valediction/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ valediction/io/csv_readers.py,sha256=oxrnU8s-HsXLu-wvDZbwY-UJ6X8RFhTiUdFutxhhXLQ,10137
+ valediction/validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ valediction/validation/helpers.py,sha256=RXJwPsZcwzUQqOYB1LlW2wIluHUgILyVRJNqBf-21ig,10495
+ valediction/validation/issues.py,sha256=cMBt2iWIzqXrsdlhHIp1UTX_uYh54wwQdhUuhtEIT8U,9402
+ valediction/validation/validation.py,sha256=xZvXo0fkP5kTvuR02ZkLRf7WBjZRc1apiLSJ5oJhVhc,22146
+ valediction-1.0.0.dist-info/METADATA,sha256=ABQUlOE2SyNLRPTS8emHZQQFAcmAAHz3Cm7OTb5VNck,612
+ valediction-1.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ valediction-1.0.0.dist-info/RECORD,,
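
Each RECORD row is path,digest,size; per the wheel spec the digest is an unpadded urlsafe-base64 SHA-256. A sketch of reproducing one row from an extracted wheel:

# Reproduce a RECORD row for a file in the extracted wheel
# (wheel spec: sha256, urlsafe base64, '=' padding stripped).
import base64
import hashlib
from pathlib import Path

def record_row(relative_path: str) -> str:
    data = Path(relative_path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest())
    return f"{relative_path},sha256={digest.rstrip(b'=').decode('ascii')},{len(data)}"

# record_row("valediction/__init__.py")
# -> "valediction/__init__.py,sha256=Bv2Nd-...,519"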
valediction-1.0.0.dist-info/WHEEL
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.27.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any
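
Root-Is-Purelib: true and the py3-none-any tag mark this as a pure-Python wheel. The Tag line matches the wheel filename at the top of this diff; a quick consistency check, assuming the third-party packaging library is available:

# Parse the wheel filename and confirm it carries the same tag.
from packaging.utils import parse_wheel_filename

name, version, _build, tags = parse_wheel_filename(
    "valediction-1.0.0-py3-none-any.whl"
)
print(name, version, sorted(str(t) for t in tags))
# valediction 1.0.0 ['py3-none-any']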