valediction-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. valediction/__init__.py +8 -0
  2. valediction/convenience.py +50 -0
  3. valediction/data_types/__init__.py +0 -0
  4. valediction/data_types/data_type_helpers.py +75 -0
  5. valediction/data_types/data_types.py +58 -0
  6. valediction/data_types/type_inference.py +541 -0
  7. valediction/datasets/__init__.py +0 -0
  8. valediction/datasets/datasets.py +870 -0
  9. valediction/datasets/datasets_helpers.py +46 -0
  10. valediction/demo/DEMO - Data Dictionary.xlsx +0 -0
  11. valediction/demo/DEMOGRAPHICS.csv +101 -0
  12. valediction/demo/DIAGNOSES.csv +650 -0
  13. valediction/demo/LAB_TESTS.csv +1001 -0
  14. valediction/demo/VITALS.csv +1001 -0
  15. valediction/demo/__init__.py +6 -0
  16. valediction/demo/demo_dictionary.py +129 -0
  17. valediction/dictionary/__init__.py +0 -0
  18. valediction/dictionary/exporting.py +501 -0
  19. valediction/dictionary/exporting_helpers.py +371 -0
  20. valediction/dictionary/generation.py +357 -0
  21. valediction/dictionary/helpers.py +174 -0
  22. valediction/dictionary/importing.py +494 -0
  23. valediction/dictionary/integrity.py +37 -0
  24. valediction/dictionary/model.py +582 -0
  25. valediction/dictionary/template/PROJECT - Data Dictionary.xltx +0 -0
  26. valediction/exceptions.py +22 -0
  27. valediction/integrity.py +97 -0
  28. valediction/io/__init__.py +0 -0
  29. valediction/io/csv_readers.py +307 -0
  30. valediction/progress.py +206 -0
  31. valediction/support.py +72 -0
  32. valediction/validation/__init__.py +0 -0
  33. valediction/validation/helpers.py +315 -0
  34. valediction/validation/issues.py +280 -0
  35. valediction/validation/validation.py +598 -0
  36. valediction-1.0.0.dist-info/METADATA +15 -0
  37. valediction-1.0.0.dist-info/RECORD +38 -0
  38. valediction-1.0.0.dist-info/WHEEL +4 -0
valediction/datasets/datasets.py
@@ -0,0 +1,870 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from datetime import timedelta
+ from pathlib import Path
+ from typing import Iterable, Iterator
+
+ from pandas import DataFrame
+
+ from valediction.datasets.datasets_helpers import DataLike
+ from valediction.dictionary.generation import Generator
+ from valediction.dictionary.importing import import_dictionary
+ from valediction.dictionary.model import Dictionary, Table
+ from valediction.exceptions import DataDictionaryImportError, DataIntegrityError
+ from valediction.io.csv_readers import (
+     FrameChunk,
+     iter_csv_chunks,
+     read_csv_all,
+     read_csv_headers,
+ )
+ from valediction.support import (
+     _get_runtime_string,
+     _normalise_name,
+     list_as_bullets,
+     print_bold_red,
+     print_red,
+ )
+ from valediction.validation.helpers import apply_data_types
+ from valediction.validation.issues import Issues
+ from valediction.validation.validation import Validator
+
+
+ @dataclass()
+ class DatasetItem:
+     """
+     Summary:
+         Represents a single table binding for validation.
+
+     Attributes:
+         name (str): table name
+         data (Path | DataFrame): DataFrame or path to csv
+         validated (bool): whether the table has been successfully validated
+         table_dictionary (Table | None): dictionary Table object for the DatasetItem
+         validator (Validator | None): validator object
+         issues (Issues): contains validation issues/deviations from the dictionary
+         validation_runtimes (dict[str, str]): validation runtime
+         dictionary_runtimes (dict[str, str]): dictionary generation runtime
+
+     Raises:
+         DataDictionaryImportError: if there is an issue with importing the dictionary
+         DataIntegrityError: if there is an issue with the integrity of the data
+     """
+
+     name: str
+     data: Path | DataFrame
+     validated: bool = False
+     table_dictionary: Table | None = None
+     validator: Validator | None = None
+     issues: Issues | None = None
+     _validation_runtimes: dict[str, timedelta] | None = None
+     _dictionary_runtimes: dict[str, timedelta] | None = None
+     _padding: int = 0
+
+     def __post_init__(self):
+         object.__setattr__(self, "issues", Issues())
+
+     # Properties
+     @property
+     def validation_runtimes(self) -> dict[str, str]:
+         if not self._validation_runtimes:
+             return {}
+
+         return {
+             step: _get_runtime_string(time_delta)
+             for step, time_delta in self._validation_runtimes.items()
+         }
+
+     @property
+     def dictionary_runtimes(self) -> dict[str, str]:
+         if not self._dictionary_runtimes:
+             return {}
+
+         return {
+             step: _get_runtime_string(time_delta)
+             for step, time_delta in self._dictionary_runtimes.items()
+         }
+
+     @property
+     def is_dataframe(self) -> bool:
+         return isinstance(self.data, DataFrame)
+
+     @property
+     def is_path(self) -> bool:
+         return isinstance(self.data, Path)
+
+     @property
+     def column_count(self) -> int:
+         if isinstance(self.data, DataFrame):
+             return self.data.shape[1]
+         else:
+             return read_csv_headers(path=self.data).shape[1]
+
+     @property
+     def primary_keys(self) -> list[str]:
+         if not self.table_dictionary:
+             raise DataDictionaryImportError(
+                 "No dictionary attached to table - please import_dictionary() or generate_dictionary() first"
+             )
+
+         return self.table_dictionary.get_primary_keys()
+
+     @property
+     def headers(self) -> list[str]:
+         if self.is_dataframe:
+             return list(self.data.columns)
+         elif self.is_path:
+             return list(read_csv_headers(path=self.data).columns)
+         else:
+             raise TypeError("self.data must be a DataFrame or str/Path to .csv")
+
+     # Magic
+     def __repr__(self) -> str:
+         if isinstance(self.data, DataFrame):
+             shape = f"{self.data.shape[0]}x{self.data.shape[1]}"
+             data_repr = f"DataFrame[{shape}]"
+         elif isinstance(self.data, Path):
+             data_repr = f"Path('{self.data.name}')"
+         else:
+             data_repr = repr(self.data)
+         return (
+             f"DatasetItem(name={self.name!r}, data={data_repr}, "
+             f"validated={self.validated})"
+         )
+
+     # Validation
+     def validate(
+         self,
+         chunk_size: int | None = None,
+         feedback: bool = True,
+     ):
+         """
+         Summary:
+             Validates the dataset item against the dictionary.
+             Warns if there are issues with the integrity of the data.
+
+         Arguments:
+             chunk_size (int | None): Size of chunks for validating data to optimise RAM usage,
+                 if reading from CSV (default: 10_000_000)
+             feedback (bool): Provide user feedback on progress (default: True)
+
+         Raises:
+             DataDictionaryImportError: if there is an issue with importing the dictionary
+         """
+         self.__check_dictionary()
+         validator = Validator(
+             dataset_item=self,
+             table_dictionary=self.table_dictionary,
+             chunk_size=chunk_size,
+             feedback=feedback,
+             _padding=self._padding,
+         )
+
+         object.__setattr__(self, "validator", validator)
+         try:
+             validator.validate()
+             # No issues: mark validated and reset issues before applying dtypes
+             object.__setattr__(self, "validated", True)
+             object.__setattr__(self, "issues", Issues())
+             if self.is_dataframe:
+                 self.apply_dictionary()
+
+         # Issues detected
+         except DataIntegrityError:
+             object.__setattr__(self, "validated", False)
+             object.__setattr__(self, "issues", validator.issues)
+
+         finally:
+             object.__setattr__(self, "_validation_runtimes", validator._runtimes)
+
+         # Warn Issues
+         try:
+             self.check()
+         except DataIntegrityError:
+             pass
+
+     def check(self) -> bool:
+         """
+         Summary:
+             Check the validity of the DatasetItem.
+
+         Raises:
+             DataIntegrityError: If there is an issue with the integrity of the data, either because:
+                 - the DatasetItem is not yet validated
+                 - there are issues with the integrity of the data
+         """
+         error = (
+             f"Issues detected in {self.name}. Issues:\n{self.issues}"
+             if len(self.issues) > 0
+             else "DatasetItem not yet validated"
+             if not self.validated
+             else ""
+         )
+         if error:
+             print_bold_red(f"WARNING: Issues detected in {self.name}.")
+             print_red(f"{self.issues}")
+             raise DataIntegrityError(error)
+         else:
+             return True
+
+     def apply_dictionary(self):
+         """
+         Summary:
+             Apply a validated Data Dictionary to a validated DatasetItem.
+
+         Raises:
+             DataDictionaryImportError: if no Data Dictionary has been imported or generated and attached to the table
+             DataIntegrityError: if the data has not been validated before attempting to apply the dictionary
+         """
+         if not self.table_dictionary:
+             raise DataDictionaryImportError(
+                 "No Data Dictionary imported or generated and attached to table. "
+                 + "Please first run Dataset.import_dictionary() or Dataset.generate_dictionary() "
+                 + "and then Dataset.validate()"
+             )
+
+         if not self.validated:
+             raise DataIntegrityError(
+                 "Cannot apply Data Dictionary to unvalidated data. "
+                 + "Please first run Dataset.validate() on the table."
+             )
+
+         if self.is_path:
+             self.import_data()
+
+         object.__setattr__(
+             self, "data", apply_data_types(self.data, self.table_dictionary)
+         )
+
+     # Data Import
+     def import_data(self):
+         """
+         Summary:
+             Import the data associated with this DatasetItem into memory.
+
+         Raises:
+             DataIntegrityError: if there is an issue with the integrity of the data
+         """
+         if self.is_dataframe:
+             print(f"DatasetItem '{self.name}' already imported")
+             return
+
+         else:
+             object.__setattr__(self, "data", read_csv_all(self.data).df)
+             if self.table_dictionary and self.validated:
+                 self.apply_dictionary()
+
+     def iterate_data_chunks(self, chunk_size: int = 10_000_000) -> Iterator[FrameChunk]:
+         """
+         Summary:
+             Yields data in chunks. If `data` is a DataFrame, yields the whole DataFrame once within
+             a FrameChunk. If the Dataset is validated, dtypes will be applied to the DataFrame.
+             If not, a warning is printed and data are yielded with string dtypes.
+
+         Args:
+             chunk_size (int, optional): Size of chunks for reading data to optimise RAM usage,
+                 if reading from CSV (default: 10_000_000)
+
+         Yields:
+             Iterator[FrameChunk]: Iterator of FrameChunks, with each chunk containing a DataFrame as `chunk.df`
+         """
+         if not self.validated:
+             print_bold_red("WARNING: ", end="")
+             print_red(
+                 f"DatasetItem '{self.name}' has not been validated. "
+                 + "All data will be yielded with string dtypes."
+             )
+         if self.is_path:
+             for chunk in iter_csv_chunks(path=self.data, chunk_size=chunk_size):
+                 if self.validated:
+                     df = apply_data_types(chunk.df, self.table_dictionary)
+                     chunk.update_df(df)
+                 yield chunk
+
+         if self.is_dataframe:
+             n = len(self.data)
+             # apply_data_types() will already have been applied if validated
+             yield FrameChunk(
+                 df=self.data,
+                 start=0,
+                 end=(n - 1) if n else 0,
+                 total_size=None,
+                 file_pos=None,
+                 bytes_read=None,
+                 chunk_index=1,
+                 total_bytes_read=None,
+                 total_chunks_seen=1,
+             )
+         return
+
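A minimal usage sketch of chunked reading (illustrative, not part of the diff; it assumes the demo CSV listed above ships at the path shown):

    from pathlib import Path
    from valediction.datasets.datasets import DatasetItem

    item = DatasetItem(name="VITALS", data=Path("valediction/demo/VITALS.csv"))
    for chunk in item.iterate_data_chunks(chunk_size=1_000_000):
        # Unvalidated items warn and yield string dtypes; validated items
        # get dictionary dtypes applied per chunk.
        print(chunk.chunk_index, chunk.df.shape)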
+     # Data Export
+     def export_data(
+         self,
+         directory: Path | str,
+         overwrite: bool = False,
+         enforce_validation: bool = True,
+     ):
+         """Export DatasetItem data to csv, if imported.
+
+         Args:
+             directory (Path | str): Directory to export csv file.
+             overwrite (bool, optional): Overwrite existing file on conflict. Defaults to False.
+             enforce_validation (bool, optional): Raise error if unvalidated. Defaults to True.
+
+         Raises:
+             ValueError: If the data is not imported, if it is unvalidated while
+                 enforce_validation is True, or if the target file exists and overwrite is False.
+         """
+         if not isinstance(directory, (Path, str)):
+             raise TypeError(f"directory must be a Path/str, not {type(directory)}")
+
+         if self.is_path:
+             raise ValueError(
+                 f"Data '{self.name}' is not imported. Run self.import_data()"
+             )
+
+         if not self.validated and enforce_validation:
+             raise ValueError(
+                 f"DatasetItem '{self.name}' has not been validated. "
+                 + "Please first run self.validate() on the DatasetItem or Dataset."
+             )
+
+         directory = Path(directory)
+         filename = f"{self.name}.csv"
+
+         if not directory.exists():
+             directory.mkdir(parents=True)
+
+         out_path = directory / filename
+         if out_path.exists() and not overwrite:
+             raise ValueError(f"File exists and overwrite=False: {out_path}")
+
+         self.data.to_csv(out_path, index=False)
+
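Continuing the sketch above, export requires in-memory data; the output directory name is illustrative:

    item.import_data()
    item.export_data("exports", overwrite=True, enforce_validation=False)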
+     # Helpers
+     def _attach_table_dictionary(self, table_dictionary: Table):
+         object.__setattr__(self, "table_dictionary", table_dictionary)
+         object.__setattr__(self, "validated", False)
+
+     def _set_padding(self, padding: int):
+         object.__setattr__(self, "_padding", padding)
+
+     def __check_dictionary(self):
+         if not isinstance(self.table_dictionary, Table):
+             raise DataDictionaryImportError(
+                 "Data Dictionary not yet imported or generated. "
+                 + "Validation must first have a Data Dictionary. "
+                 + "Please first run Dataset.import_dictionary(), including `primary_keys`."
+             )
+
+         self.table_dictionary.check()
+
+
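Before the container class below, a brief sketch of the two DatasetItem states (the demo path is assumed from this wheel's file list):

    item = DatasetItem(name="DEMOGRAPHICS", data=Path("valediction/demo/DEMOGRAPHICS.csv"))
    item.is_path        # True: still file-backed
    item.headers        # column names read from the CSV header row
    item.import_data()  # load the CSV into a DataFrame
    item.is_dataframe   # True: now held in memory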
+ class Dataset(list[DatasetItem]):
+     """
+     Summary:
+         A list-like container of DatasetItem with helpful name-based accessors and
+         creators. Also holds an optional Dictionary and can generate one from
+         the current items.
+
+     Arguments:
+         items (Iterable[DatasetItem] | None): An iterable of DatasetItem objects.
+     """
+
+     # Properties
+     @property
+     def validated(self) -> bool:
+         return all(item.validated for item in self)
+
+     # Magic
+     def __init__(self, items: Iterable[DatasetItem] | None = None) -> None:
+         super().__init__(items or [])
+         self.dictionary: Dictionary | None = None
+         self.issues: Issues = Issues()
+
+     def __repr__(self) -> str:
+         base = f"Dataset(len={len(self)}, dictionary_loaded={self._dd_loaded()}"
+         items_str = (
+             ")"
+             if not len(self)
+             else f", {list_as_bullets(elements=[str(d) for d in self])}\n)"
+         )
+
+         return f"{base}{items_str}"
+
+     # Creation
+     @classmethod
+     def create_from(
+         cls,
+         dataset: Path | str | dict[str, DataFrame],
+         *,
+         overwrite: bool = False,
+     ) -> Dataset:
+         """Build a Dataset from a path (file/dir) or dictionary of {name: DataFrame}."""
+         if not isinstance(dataset, (Path, str, dict)):
+             raise TypeError(
+                 f"dataset must be a Path/str or dict[str, DataFrame], not {type(dataset)}"
+             )
+
+         ds = cls()
+
+         # Path-like input
+         if isinstance(dataset, (Path, str)):
+             items = cls._items_from_pathlike(Path(dataset))
+             if len(items) == 1:
+                 ds.add(items[0], overwrite=overwrite)
+             else:
+                 ds.extend_add(items, overwrite=overwrite)
+
+         # Mapping input
+         else:
+             items = [
+                 cls._make_item(name=name, data=data) for name, data in dataset.items()
+             ]
+             ds.extend_add(items, overwrite=overwrite)
+
+         # Set Padding
+         max_length = max((len(item.name) for item in ds), default=0)
+         for item in ds:
+             padding = max_length - len(item.name)
+             item._set_padding(padding)
+         return ds
+
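A hedged sketch of both creation paths (the directory and the vitals_df DataFrame are illustrative):

    from valediction.datasets.datasets import Dataset

    ds = Dataset.create_from("valediction/demo")      # one DatasetItem per *.csv in the directory
    ds = Dataset.create_from({"VITALS": vitals_df})   # or in-memory frames, keyed by table name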
+     # Getters
+     def get(self, name: str, default: DatasetItem | None = None) -> DatasetItem | None:
+         name_key = name.strip()
+         for item in self:
+             if item.name.lower() == name_key.lower():
+                 return item
+         return default
+
+     def index_of(self, name: str) -> int | None:
+         name_key = name.strip()
+         for i, item in enumerate(self):
+             if item.name.lower() == name_key.lower():
+                 return i
+         return None
+
+     # Manipulation
+     def add(self, item: DatasetItem, *, overwrite: bool = False) -> None:
+         """
+         Summary:
+             Add a new DatasetItem to the end of the Dataset, optionally
+             overwriting any existing item with the same name.
+
+         Arguments:
+             item (DatasetItem): The DatasetItem to be added.
+             overwrite (bool): Whether to overwrite any existing item with the same name.
+                 Defaults to False.
+
+         Raises:
+             ValueError: If an item with the same name already exists and overwrite is False.
+         """
+         existing_index = self.index_of(item.name)
+         if existing_index is not None and not overwrite:
+             raise ValueError(
+                 f"Item with name '{item.name}' already exists. Use overwrite=True to replace."
+             )
+         if existing_index is None:
+             self.append(item)
+         else:
+             self[existing_index] = item
+
+     def extend_add(
+         self, items: Iterable[DatasetItem], *, overwrite: bool = False
+     ) -> None:
+         """
+         Summary:
+             Extend the Dataset by adding multiple DatasetItems.
+
+         Arguments:
+             items (Iterable[DatasetItem]): An iterable of DatasetItems to be added.
+             overwrite (bool): Whether to overwrite any existing item with the same name.
+                 Defaults to False.
+
+         Raises:
+             ValueError: If an item with the same name already exists and overwrite is False.
+         """
+         for it in items:
+             self.add(it, overwrite=overwrite)
+
+     # Data Dictionary
+     def import_dictionary(self, dictionary: Dictionary | Path | str) -> None:
+         """
+         Summary:
+             Attach a dictionary to the Dataset.
+
+         Arguments:
+             dictionary (Dictionary | Path | str): A dictionary to be attached, either as a Dictionary object
+                 or a Path/str filepath to compatible dictionary .xlsx file.
+
+         Raises:
+             TypeError: If the dictionary is not a Dictionary instance or a Path/str to an importable file.
+         """
+         if isinstance(dictionary, Dictionary):
+             self.dictionary = dictionary
+         elif isinstance(dictionary, (Path, str)):
+             path = Path(dictionary)
+             self.dictionary = import_dictionary(path)
+         else:
+             raise TypeError(
+                 "dictionary must be a Dictionary instance or a Path/str to an importable file."
+             )
+
+         self._attach_table_dictionaries()
+
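For example, attaching the demo dictionary that ships in this wheel (path taken from the file list above; a sketch, not package documentation):

    ds = Dataset.create_from("valediction/demo")
    ds.import_dictionary("valediction/demo/DEMO - Data Dictionary.xlsx")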
+     def export_dictionary(
+         self,
+         directory: Path | str,
+         filename: str | None = None,
+         overwrite: bool = False,
+         debug: bool = False,
+         _template_path: Path | str | None = None,
+     ):
+         """
+         Summary:
+             Export a data dictionary to an Excel file.
+
+         Arguments:
+             directory (Path | str): The directory to export to.
+             filename (str | None): The filename to export to (default is None).
+             overwrite (bool): Whether to overwrite existing file (default is False).
+             debug (bool): Whether to print debug information (default is False).
+             _template_path (Path | str | None): The path to the template data dictionary
+                 (default is None; changing not advised).
+
+         Returns:
+             None
+
+         Raises:
+             FileNotFoundError: If the directory specified by directory does not exist.
+             ValueError: If the file specified by filename already exists and overwrite is False.
+         """
+         if getattr(self, "dictionary", None) is None:
+             raise ValueError("No Dictionary attached to this Dataset.")
+         from valediction.dictionary.exporting import (
+             export_dictionary,  # Avoid circular import
+         )
+
+         return export_dictionary(
+             dictionary=self.dictionary,  # type: ignore[arg-type]
+             directory=directory,
+             filename=filename,
+             overwrite=overwrite,
+             debug=debug,
+             _template_path=_template_path,
+         )
+
+     def generate_dictionary(
+         self,
+         dictionary_name: str | None = None,
+         primary_keys: dict[str, list[str | int]] | None = None,
+         feedback: bool = True,
+         debug: bool = False,
+         chunk_size: int | None = 10_000_000,
+         sample_rows: int | None = None,
+     ) -> Dictionary:
+         """
+         Summary:
+             Generate a dictionary from a Dataset.
+
+         Arguments:
+             dictionary_name (str | None): The name of the dictionary to generate.
+                 If None, will not be set.
+             primary_keys (dict[str, list[str | int]] | None): A dictionary of primary keys
+                 to set on the generated dictionary. If None, will not be set.
+             feedback (bool): Provide user feedback on progress (default: True)
+             debug (bool): Enable debug mode, providing full log of data type inference and
+                 reasoning (default: False)
+             chunk_size (int | None): Size of chunks for reading data to optimise RAM usage,
+                 if reading from CSV (default: 10_000_000)
+             sample_rows (int | None): Number of rows to sample for data type inference. Note:
+                 this overrides `chunk_size` and reads in a single chunk (default: None)
+
+         Returns:
+             Dictionary: The generated dictionary.
+         """
+         generator = Generator(
+             feedback=feedback,
+             debug=debug,
+             chunk_size=chunk_size,
+             sample_rows=sample_rows,
+         )
+         dictionary = generator.generate_dictionary(
+             self,
+             dictionary_name=dictionary_name,
+             primary_keys=primary_keys,
+         )
+         self.dictionary = dictionary
+         self._attach_table_dictionaries()
+         return dictionary
+
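When no curated dictionary exists, one can be inferred from the data and exported for review. A sketch; the dictionary name and key column are assumptions, not taken from the diff:

    dictionary = ds.generate_dictionary(
        dictionary_name="DEMO",
        primary_keys={"DEMOGRAPHICS": ["PATIENT_ID"]},  # hypothetical key column
    )
    ds.export_dictionary("exports", overwrite=True)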
+     # Data
+     def import_data(
+         self,
+         name: str | None = None,
+     ) -> None:
+         """
+         Summary:
+             Import data from CSV files into the Dataset.
+
+         Arguments:
+             name (str | None): The name of the table to import data into. If None, all tables are imported.
+
+         Raises:
+             FileNotFoundError: If the file specified by name does not exist.
+         """
+         if name:
+             self[name].import_data()
+
+         else:
+             for item in self:
+                 if item.is_path:
+                     item.import_data()
+
+     def export_data(
+         self,
+         directory: Path | str,
+         overwrite: bool = False,
+         enforce_validation: bool = True,
+     ):
+         """Export items from Dataset data to csv, if imported. Unimported items are
+         skipped. Unvalidated items are skipped if enforce_validation is True.
+
+         Args:
+             directory (Path | str): Directory to export csv files.
+             overwrite (bool, optional): Overwrite existing files on conflict. Defaults to False.
+             enforce_validation (bool, optional): Skip unvalidated items. Defaults to True.
+
+         Raises:
+             ValueError: If target files exist and overwrite is False.
+         """
+         if not isinstance(directory, (Path, str)):
+             raise TypeError(f"directory must be a Path/str, not {type(directory)}")
+         print("Exporting data...")
+         # Check for issues
+         unimported_items = [item for item in self if item.is_path]
+         unvalidated_items = [
+             item for item in self if item.is_dataframe and not item.validated
+         ]
+
+         if unimported_items:
+             print_bold_red("WARNING: Skipping unimported tables: ", end="")
+             print_red(list_as_bullets([item.name for item in unimported_items]))
+
+         if unvalidated_items and enforce_validation:
+             print_bold_red("WARNING: Skipping unvalidated tables: ", end="")
+             print_red(list_as_bullets([item.name for item in unvalidated_items]))
+
+         # Set exportable
+         exportable: list[DatasetItem] = []
+         for item in self:
+             if item.is_dataframe:
+                 if item.validated or not enforce_validation:
+                     exportable.append(item)
+
+         directory = Path(directory)
+         filenames = [directory / f"{item.name}.csv" for item in exportable]
+
+         # Check for conflicts and overwrite config
+         conflicts = [str(filename) for filename in filenames if filename.exists()]
+         if conflicts and not overwrite:
+             raise ValueError(
+                 f"File exists and overwrite=False: {list_as_bullets(conflicts)}"
+             )
+
+         # Export
+         for item in exportable:
+             print(f" - exporting '{item.name}'")
+             item.export_data(directory, overwrite=overwrite, enforce_validation=False)
+
+         print(f"Export complete ({len(exportable)} tables)")
+
+     def apply_dictionary(self, name: str | None = None) -> None:
+         """
+         Summary:
+             Apply a dictionary to a Dataset.
+
+         Arguments:
+             name (str | None): The name of the table to apply the dictionary to. If None, all tables are applied.
+
+         Returns:
+             None
+
+         Raises:
+             ValueError: If the Dataset does not contain a dictionary.
+         """
+         if name:
+             self[name].apply_dictionary()
+
+         else:
+             for item in self:
+                 item.apply_dictionary()
+
+     # Validation
+     def validate(
+         self,
+         chunk_size: int | None = None,
+         feedback: bool = True,
+     ) -> None:
+         """
+         Summary:
+             Validate data in the Dataset against the dictionary.
+
+         Arguments:
+             chunk_size (int | None): Size of chunks for validating data to optimise RAM usage.
+             feedback (bool): Provide user feedback on progress (default: True)
+
+         Returns:
+             None
+
+         Raises:
+             DataIntegrityError: If there is an issue with the integrity of the data
+             DataDictionaryImportError: If there is an issue with importing the dictionary
+         """
+         if feedback:
+             print(f"Validating {len(self)} tables")
+         self.__check_dictionary()
+         for item in self:
+             try:
+                 item.validate(
+                     chunk_size=chunk_size,
+                     feedback=feedback,
+                 )
+             except DataIntegrityError:
+                 pass
+
+         self.__reattach_issues()
+
+         # Report Issues
+         try:
+             self.check(readout=True)
+         except DataIntegrityError:
+             pass
+
+         if feedback:
+             print("\n", end="")
+
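Putting the pieces together, a hedged end-to-end sketch using the bundled demo files (paths from the file list above; behaviour as described in the docstrings):

    ds = Dataset.create_from("valediction/demo")
    ds.import_dictionary("valediction/demo/DEMO - Data Dictionary.xlsx")
    ds.validate()                # per-table validation; failures are collected, not raised
    if ds.validated:
        ds.import_data()         # load CSVs and apply dictionary dtypes
        ds.export_data("exports", overwrite=True)
    else:
        print(ds.issues)         # aggregated deviations from the dictionary
    ds["VITALS"]                 # items are addressable by name as well as index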
+     def __reattach_issues(self):
+         self.issues = Issues()
+         for item in self:
+             self.issues.extend(item.issues)
+
+     def __items_with_issues(self) -> str:
+         items_with_issues = [item.name for item in self if len(item.issues) > 0]
+         return ",".join(items_with_issues)
+
+     def check(self, readout: bool = False) -> bool:
+         """
+         Summary:
+             Check the validity of the Dataset.
+
+         Raises:
+             DataIntegrityError: If there is an issue with the integrity of the data, either because:
+                 - the Dataset is not yet validated
+                 - there are issues with the integrity of the data
+         """
+         error = (
+             f"WARNING: Unvalidated tables or issues detected in {self.__items_with_issues()}:"
+             if len(self.issues) > 0
+             else "Dataset not yet validated"
+             if not self.validated
+             else ""
+         )
+         if error:
+             if readout:
+                 print_bold_red(f"\n{error}")
+                 print_red(self.issues)
+             raise DataIntegrityError(f"{error}\n{self.issues}")
+         else:
+             return True
+
+     # Creation Helpers
+     @staticmethod
+     def _make_item(
+         name: str | None,
+         data: DataLike,
+     ) -> DatasetItem:
+         """Normalise a (name, data) pair into a DatasetItem."""
+         if isinstance(data, (str, Path)):
+             path = Path(data)
+             if not path.exists():
+                 raise FileNotFoundError(f"File not found: {path}")
+             if path.suffix.lower() != ".csv":
+                 raise ValueError(f"Only .csv supported right now, got: {path}")
+             resolved_name = _normalise_name(name or path.stem)
+             return DatasetItem(name=resolved_name, data=path.resolve())
+
+         if isinstance(data, DataFrame):
+             if not name:
+                 raise ValueError("When providing a DataFrame, 'name' is required.")
+             resolved_name = _normalise_name(name)
+             return DatasetItem(name=resolved_name, data=data)
+
+         raise TypeError("data must be a Path/str to .csv or a pandas DataFrame.")
+
+     @staticmethod
+     def _items_from_pathlike(p: Path) -> list[DatasetItem]:
+         """Expand a file/dir path into DatasetItems (non-recursive for dirs)."""
+         if not p.exists():
+             raise FileNotFoundError(f"Path not found: {p}")
+
+         if p.is_file():
+             if p.suffix.lower() != ".csv":
+                 raise ValueError(f"Expected a .csv file, got: {p.suffix} ({p})")
+             return [DatasetItem(name=_normalise_name(p.stem), data=p.resolve())]
+
+         if p.is_dir():
+             return [
+                 DatasetItem(
+                     name=_normalise_name(csv_path.stem), data=csv_path.resolve()
+                 )
+                 for csv_path in p.glob("*.csv")
+             ]
+
+         raise ValueError(f"Unsupported path type: {p}")
+
+     # Validation Helpers
+     def __check_dictionary(self):
+         if not isinstance(self.dictionary, Dictionary):
+             raise DataDictionaryImportError(
+                 "Data Dictionary not yet imported or generated. "
+                 + "Validation must first have a Data Dictionary. "
+                 + "Please first run Dataset.import_dictionary(), including `primary_keys`."
+             )
+
+         self.dictionary.check()
+
+     # Other Helpers
+     def __getitem__(self, key: int | str) -> DatasetItem:
+         if isinstance(key, int):
+             return super().__getitem__(key)
+         found = self.get(key)
+         if found is None:
+             raise KeyError(f"No DatasetItem with name '{key}'.")
+         return found
+
+     def _dd_loaded(self):
+         return self.dictionary is not None
+
+     def _attach_table_dictionaries(self):
+         for dataset_item in self:
+             table_name = dataset_item.name
+             table_dictionary = self.dictionary.get_table(table_name)
+             if not table_dictionary:
+                 raise DataDictionaryImportError(
+                     f"No dictionary table found for '{table_name}'"
+                 )
+
+             dataset_item._attach_table_dictionary(table_dictionary)