valediction 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- valediction/__init__.py +8 -0
- valediction/convenience.py +50 -0
- valediction/data_types/__init__.py +0 -0
- valediction/data_types/data_type_helpers.py +75 -0
- valediction/data_types/data_types.py +58 -0
- valediction/data_types/type_inference.py +541 -0
- valediction/datasets/__init__.py +0 -0
- valediction/datasets/datasets.py +870 -0
- valediction/datasets/datasets_helpers.py +46 -0
- valediction/demo/DEMO - Data Dictionary.xlsx +0 -0
- valediction/demo/DEMOGRAPHICS.csv +101 -0
- valediction/demo/DIAGNOSES.csv +650 -0
- valediction/demo/LAB_TESTS.csv +1001 -0
- valediction/demo/VITALS.csv +1001 -0
- valediction/demo/__init__.py +6 -0
- valediction/demo/demo_dictionary.py +129 -0
- valediction/dictionary/__init__.py +0 -0
- valediction/dictionary/exporting.py +501 -0
- valediction/dictionary/exporting_helpers.py +371 -0
- valediction/dictionary/generation.py +357 -0
- valediction/dictionary/helpers.py +174 -0
- valediction/dictionary/importing.py +494 -0
- valediction/dictionary/integrity.py +37 -0
- valediction/dictionary/model.py +582 -0
- valediction/dictionary/template/PROJECT - Data Dictionary.xltx +0 -0
- valediction/exceptions.py +22 -0
- valediction/integrity.py +97 -0
- valediction/io/__init__.py +0 -0
- valediction/io/csv_readers.py +307 -0
- valediction/progress.py +206 -0
- valediction/support.py +72 -0
- valediction/validation/__init__.py +0 -0
- valediction/validation/helpers.py +315 -0
- valediction/validation/issues.py +280 -0
- valediction/validation/validation.py +598 -0
- valediction-1.0.0.dist-info/METADATA +15 -0
- valediction-1.0.0.dist-info/RECORD +38 -0
- valediction-1.0.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,870 @@
from __future__ import annotations

from dataclasses import dataclass
from datetime import timedelta
from pathlib import Path
from typing import Iterable, Iterator

from pandas import DataFrame

from valediction.datasets.datasets_helpers import DataLike
from valediction.dictionary.generation import Generator
from valediction.dictionary.importing import import_dictionary
from valediction.dictionary.model import Dictionary, Table
from valediction.exceptions import DataDictionaryImportError, DataIntegrityError
from valediction.io.csv_readers import (
    FrameChunk,
    iter_csv_chunks,
    read_csv_all,
    read_csv_headers,
)
from valediction.support import (
    _get_runtime_string,
    _normalise_name,
    list_as_bullets,
    print_bold_red,
    print_red,
)
from valediction.validation.helpers import apply_data_types
from valediction.validation.issues import Issues
from valediction.validation.validation import Validator


@dataclass()
class DatasetItem:
    """
    Summary:
        Represents a single table binding for validation.

    Attributes:
        name (str): table name
        data (Path | DataFrame): DataFrame or path to csv
        validated (bool): whether the table has been successfully validated
        table_dictionary (Table | None): dictionary Table object for the DatasetItem
        validator (Validator | None): validator object
        issues (Issues): contains validation issues/deviations from the dictionary
        validation_runtimes (dict[str, str]): validation runtimes
        dictionary_runtimes (dict[str, str]): dictionary generation runtimes

    Raises:
        DataDictionaryImportError: if there is an issue with importing the dictionary
        DataIntegrityError: if there is an issue with the integrity of the data
    """

    name: str
    data: Path | DataFrame
    validated: bool = False
    table_dictionary: Table | None = None
    validator: Validator | None = None
    issues: Issues | None = None
    _validation_runtimes: dict[str, timedelta] | None = None
    _dictionary_runtimes: dict[str, timedelta] | None = None
    _padding: int = 0

    def __post_init__(self):
        object.__setattr__(self, "issues", Issues())

    # Properties
    @property
    def validation_runtimes(self) -> dict[str, str]:
        if not self._validation_runtimes:
            return {}

        return {
            step: _get_runtime_string(time_delta)
            for step, time_delta in self._validation_runtimes.items()
        }

    @property
    def dictionary_runtimes(self) -> dict[str, str]:
        if not self._dictionary_runtimes:
            return {}

        return {
            step: _get_runtime_string(time_delta)
            for step, time_delta in self._dictionary_runtimes.items()
        }

    @property
    def is_dataframe(self) -> bool:
        return isinstance(self.data, DataFrame)

    @property
    def is_path(self) -> bool:
        return isinstance(self.data, Path)

    @property
    def column_count(self) -> int:
        if isinstance(self.data, DataFrame):
            return self.data.shape[1]
        else:
            return read_csv_headers(path=self.data).shape[1]

    @property
    def primary_keys(self) -> list[str]:
        if not self.table_dictionary:
            raise DataDictionaryImportError(
                "No dictionary attached to table - please import_dictionary() or generate_dictionary() first"
            )

        return self.table_dictionary.get_primary_keys()

    @property
    def headers(self) -> list[str]:
        if self.is_dataframe:
            return list(self.data.columns)
        elif self.is_path:
            return list(read_csv_headers(path=self.data).columns)
        else:
            raise TypeError("self.data must be a DataFrame or str/Path to .csv")

    # Magic
    def __repr__(self) -> str:
        if isinstance(self.data, DataFrame):
            shape = f"{self.data.shape[0]}x{self.data.shape[1]}"
            data_repr = f"DataFrame[{shape}]"
        elif isinstance(self.data, Path):
            data_repr = f"Path('{self.data.name}')"
        else:
            data_repr = repr(self.data)
        return (
            f"DatasetItem(name={self.name!r}, data={data_repr}, "
            f"validated={self.validated})"
        )

    # Validation
    def validate(
        self,
        chunk_size: int | None = None,
        feedback: bool = True,
    ):
        """
        Summary:
            Validates the dataset item against the dictionary.
            Warns if there are issues with the integrity of the data.

        Arguments:
            chunk_size (int | None): Size of chunks for validating data to optimise RAM usage,
                if reading from CSV (default: 10_000_000)
            feedback (bool): Provide user feedback on progress (default: True)

        Raises:
            DataDictionaryImportError: if there is an issue with importing the dictionary
        """
        self.__check_dictionary()
        validator = Validator(
            dataset_item=self,
            table_dictionary=self.table_dictionary,
            chunk_size=chunk_size,
            feedback=feedback,
            _padding=self._padding,
        )

        object.__setattr__(self, "validator", validator)
        try:
            validator.validate()
            object.__setattr__(self, "validated", True)
            object.__setattr__(self, "issues", Issues())
            if self.is_dataframe:
                self.apply_dictionary()

        # Issues detected
        except DataIntegrityError:
            object.__setattr__(self, "validated", False)
            object.__setattr__(self, "issues", validator.issues)

        # No Issues
        else:
            object.__setattr__(self, "validated", True)

        finally:
            object.__setattr__(self, "_validation_runtimes", validator._runtimes)

        # Warn Issues
        try:
            self.check()
        except DataIntegrityError:
            pass

    def check(self) -> bool:
        """
        Summary:
            Check the validity of the DatasetItem.

        Raises:
            DataIntegrityError: If there is an issue with the integrity of the data, either because:
                - the DatasetItem is not yet validated
                - there are issues with the integrity of the data
        """
        error = (
            f"Issues detected in {self.name}. Issues:\n{self.issues}"
            if len(self.issues) > 0
            else "DatasetItem not yet validated"
            if not self.validated
            else ""
        )
        if error:
            print_bold_red(f"WARNING: Issues detected in {self.name}.")
            print_red(f"{self.issues}")
            raise DataIntegrityError(error)
        else:
            return True

    def apply_dictionary(self):
        """
        Summary:
            Apply a validated Data Dictionary to a validated DatasetItem.

        Raises:
            DataDictionaryImportError: if no Data Dictionary has been imported or generated and attached to the table
            DataIntegrityError: if the data has not been validated before attempting to apply the dictionary
        """
        if not self.table_dictionary:
            raise DataDictionaryImportError(
                "No Data Dictionary imported or generated and attached to table. "
                + "Please first run Dataset.import_dictionary() or Dataset.generate_dictionary() "
                + "and then Dataset.validate()"
            )

        if not self.validated:
            raise DataIntegrityError(
                "Cannot apply Data Dictionary to unvalidated data. "
                + "Please first run Dataset.validate() on the table."
            )

        if self.is_path:
            self.import_data()

        object.__setattr__(
            self, "data", apply_data_types(self.data, self.table_dictionary)
        )

    # Data Import
    def import_data(self):
        """
        Summary:
            Import the data associated with this DatasetItem into memory.

        Raises:
            DataIntegrityError: if there is an issue with the integrity of the data
        """
        if self.is_dataframe:
            print(f"DatasetItem '{self.name}' already imported")
            return

        else:
            object.__setattr__(self, "data", read_csv_all(self.data).df)
            if self.table_dictionary and self.validated:
                self.apply_dictionary()

    def iterate_data_chunks(self, chunk_size: int = 10_000_000) -> Iterator[FrameChunk]:
        """
        Summary:
            Yields data in chunks. If `data` is a DataFrame, yields the whole DataFrame once within
            a FrameChunk. If the DatasetItem is validated, dtypes will be applied to the DataFrame.
            If not, will warn and yield string dtypes.

        Args:
            chunk_size (int, optional): Size of chunks for reading data to optimise RAM usage,
                if reading from CSV (default: 10_000_000)

        Yields:
            Iterator[FrameChunk]: Iterator of FrameChunks, with each chunk containing a DataFrame as `chunk.df`
        """
        if not self.validated:
            print_bold_red("WARNING: ", end="")
            print_red(
                f"DatasetItem '{self.name}' has not been validated. "
                + "All data will be yielded with string dtypes."
            )
        if self.is_path:
            for chunk in iter_csv_chunks(path=self.data, chunk_size=chunk_size):
                if self.validated:
                    df = apply_data_types(chunk.df, self.table_dictionary)
                    chunk.update_df(df)
                yield chunk

        if self.is_dataframe:
            n = len(self.data)
            # apply_data_types() will already have been applied if validated
            yield FrameChunk(
                df=self.data,
                start=0,
                end=(n - 1) if n else 0,
                total_size=None,
                file_pos=None,
                bytes_read=None,
                chunk_index=1,
                total_bytes_read=None,
                total_chunks_seen=1,
            )
            return

    # Data Export
    def export_data(
        self,
        directory: Path | str,
        overwrite: bool = False,
        enforce_validation: bool = True,
    ):
        """Export DatasetItem data to csv, if imported.

        Args:
            directory (Path | str): Directory to export csv file.
            overwrite (bool, optional): Overwrite existing file on conflict. Defaults to False.
            enforce_validation (bool, optional): Raise error if unvalidated. Defaults to True.

        Raises:
            ValueError: If the data is unimported, unvalidated while enforce_validation is True,
                or the file exists without overwrite.
        """
        if not isinstance(directory, (Path, str)):
            raise TypeError(f"directory must be a Path/str, not {type(directory)}")

        if self.is_path:
            raise ValueError(
                f"Data '{self.name}' is not imported. Run self.import_data()"
            )

        if not self.validated:
            if enforce_validation:
                raise ValueError(
                    f"DatasetItem '{self.name}' has not been validated. "
                    + "Please first run self.validate() on the DatasetItem or Dataset."
                )

        directory = Path(directory)
        filename = f"{self.name}.csv"

        if not directory.exists():
            directory.mkdir(parents=True)

        out_path = directory / filename
        if out_path.exists() and not overwrite:
            raise ValueError(f"File exists and overwrite=False: {out_path}")

        self.data.to_csv(out_path, index=False)

    # Helpers
    def _attach_table_dictionary(self, table_dictionary: Table):
        object.__setattr__(self, "table_dictionary", table_dictionary)
        object.__setattr__(self, "validated", False)

    def _set_padding(self, padding: int):
        object.__setattr__(self, "_padding", padding)

    def __check_dictionary(self):
        if self.table_dictionary is None or not isinstance(
            self.table_dictionary, Table
        ):
            raise DataDictionaryImportError(
                "Data Dictionary not yet imported or generated. "
                + "Validation must first have a Data Dictionary. "
                + "Please first run Dataset.import_dictionary(), including `primary_keys`."
            )

        self.table_dictionary.check()


class Dataset(list[DatasetItem]):
    """
    Summary:
        A list-like container of DatasetItem with helpful name-based accessors and
        creators. Also holds an optional Dictionary and can generate one from
        the current items.

    Arguments:
        items (Iterable[DatasetItem] | None): An iterable of DatasetItem objects.
    """

    # Properties
    @property
    def validated(self) -> bool:
        return all(item.validated for item in self)

    # Magic
    def __init__(self, items: Iterable[DatasetItem] | None = None) -> None:
        super().__init__(items or [])
        self.dictionary: Dictionary | None = None
        self.issues: Issues | None = None

    def __repr__(self) -> str:
        base = f"Dataset(len={len(self)}, dictionary_loaded={self._dd_loaded()}"
        items_str = (
            ")"
            if not len(self)
            else f", {list_as_bullets(elements=[str(d) for d in self])}\n)"
        )

        return f"{base}{items_str}"

    # Creation
    @classmethod
    def create_from(
        cls,
        dataset: Path | str | dict[str, DataFrame],
        *,
        overwrite: bool = False,
    ) -> Dataset:
        """Build a Dataset from a path (file/dir) or a dict of {name: DataFrame}."""
        if not isinstance(dataset, (Path, str, dict)):
            raise TypeError(
                f"dataset must be a Path/str or dict[str, DataFrame], not {type(dataset)}"
            )

        ds = cls()

        # Path-like input
        if isinstance(dataset, (Path, str)):
            items = cls._items_from_pathlike(Path(dataset))
            if len(items) == 1:
                ds.add(items[0], overwrite=overwrite)
            else:
                ds.extend_add(items, overwrite=overwrite)

        # Dict input
        else:
            items = [
                cls._make_item(name=name, data=data) for name, data in dataset.items()
            ]
            ds.extend_add(items, overwrite=overwrite)

        # Set Padding
        max_length = max(len(item.name) for item in ds)
        for item in ds:
            padding = max_length - len(item.name)
            item._set_padding(padding)
        return ds

    # Getters
    def get(self, name: str, default: DatasetItem | None = None) -> DatasetItem | None:
        name_key = name.strip()
        for item in self:
            if item.name.lower() == name_key.lower():
                return item
        return default

    def index_of(self, name: str) -> int | None:
        name_key = name.strip()
        for i, item in enumerate(self):
            if item.name == name_key:
                return i
        return None

    # Manipulation
    def add(self, item: DatasetItem, *, overwrite: bool = False) -> None:
        """
        Summary:
            Add a new DatasetItem to the end of the Dataset, optionally
            overwriting any existing item with the same name.

        Arguments:
            item (DatasetItem): The DatasetItem to be added.
            overwrite (bool): Whether to overwrite any existing item with the same name.
                Defaults to False.

        Raises:
            ValueError: If an item with the same name already exists and overwrite is False.
        """
        existing_index = self.index_of(item.name)
        if existing_index is not None and not overwrite:
            raise ValueError(
                f"Item with name '{item.name}' already exists. Use overwrite=True to replace."
            )
        if existing_index is None:
            self.append(item)
        else:
            self[existing_index] = item

    def extend_add(
        self, items: Iterable[DatasetItem], *, overwrite: bool = False
    ) -> None:
        """
        Summary:
            Extend the Dataset by adding multiple DatasetItems.

        Arguments:
            items (Iterable[DatasetItem]): An iterable of DatasetItems to be added.
            overwrite (bool): Whether to overwrite any existing item with the same name.
                Defaults to False.

        Raises:
            ValueError: If an item with the same name already exists and overwrite is False.
        """
        for it in items:
            self.add(it, overwrite=overwrite)

    # Data Dictionary
    def import_dictionary(self, dictionary: Dictionary | Path | str) -> None:
        """
        Summary:
            Attach a dictionary to the Dataset.

        Arguments:
            dictionary (Dictionary | Path | str): A dictionary to be attached, either as a Dictionary object
                or a Path/str filepath to a compatible dictionary .xlsx file.

        Raises:
            TypeError: If the dictionary is not a Dictionary instance or a Path/str to an importable file.
        """
        if isinstance(dictionary, Dictionary):
            self.dictionary = dictionary
        elif isinstance(dictionary, (Path, str)):
            path = Path(dictionary)
            self.dictionary = import_dictionary(path)
        else:
            raise TypeError(
                "dictionary must be a Dictionary instance or a Path/str to an importable file."
            )

        self._attach_table_dictionaries()

    # Data Dictionary
    def export_dictionary(
        self,
        directory: Path | str,
        filename: str | None = None,
        overwrite: bool = False,
        debug: bool = False,
        _template_path: Path | str | None = None,
    ):
        """
        Summary:
            Export a data dictionary to an Excel file.

        Arguments:
            directory (Path | str): The directory to export to.
            filename (str | None): The filename to export to (default is None).
            overwrite (bool): Whether to overwrite an existing file (default is False).
            debug (bool): Whether to print debug information (default is False).
            _template_path (Path | str | None): The path to the template data dictionary
                (default is None; changing not advised).

        Returns:
            None

        Raises:
            FileNotFoundError: If the directory specified by directory does not exist.
            ValueError: If the file specified by filename already exists and overwrite is False.
        """
        if getattr(self, "dictionary", None) is None:
            raise ValueError("No Dictionary attached to this Dataset.")
        from valediction.dictionary.exporting import (
            export_dictionary,  # Avoid circular import
        )

        return export_dictionary(
            dictionary=self.dictionary,  # type: ignore[arg-type]
            directory=directory,
            filename=filename,
            overwrite=overwrite,
            debug=debug,
            _template_path=_template_path,
        )

    def generate_dictionary(
        self,
        dictionary_name: str | None = None,
        primary_keys: dict[str, list[str | int]] | None = None,
        feedback: bool = True,
        debug: bool = False,
        chunk_size: int | None = 10_000_000,
        sample_rows: int | None = None,
    ) -> Dictionary:
        """
        Summary:
            Generate a dictionary from a Dataset.

        Arguments:
            dictionary_name (str | None): The name of the dictionary to generate.
                If None, will not be set.
            primary_keys (dict[str, list[str | int]] | None): A dictionary of primary keys
                to set on the generated dictionary. If None, will not be set.
            feedback (bool): Provide user feedback on progress (default: True)
            debug (bool): Enable debug mode, providing a full log of data type inference and
                reasoning (default: False)
            chunk_size (int | None): Size of chunks for reading data to optimise RAM usage,
                if reading from CSV (default: 10_000_000)
            sample_rows (int | None): Number of rows to sample for data type inference. Note:
                this overrides `chunk_size` and reads in a single chunk (default: None)

        Returns:
            Dictionary: The generated dictionary.
        """
        generator = Generator(
            feedback=feedback,
            debug=debug,
            chunk_size=chunk_size,
            sample_rows=sample_rows,
        )
        dictionary = generator.generate_dictionary(
            self,
            dictionary_name=dictionary_name,
            primary_keys=primary_keys,
        )
        self.dictionary = dictionary
        self._attach_table_dictionaries()
        return dictionary

    # Data
    def import_data(
        self,
        name: str | None = None,
    ) -> None:
        """
        Summary:
            Import data from CSV files into the Dataset.

        Arguments:
            name (str | None): The name of the table to import data into. If None, all tables are imported.

        Raises:
            FileNotFoundError: If the file specified by name does not exist.
        """
        if name:
            self[name].import_data()

        else:
            for item in self:
                if item.is_path:
                    item.import_data()

    def export_data(
        self,
        directory: Path | str,
        overwrite: bool = False,
        enforce_validation: bool = True,
    ):
        """Export items from Dataset data to csv, if imported. Unimported items are
        skipped. Unvalidated items are skipped if enforce_validation is True.

        Args:
            directory (Path | str): Directory to export csv files.
            overwrite (bool, optional): Overwrite existing files on conflict. Defaults to False.
            enforce_validation (bool, optional): Raise error if unvalidated. Defaults to True.

        Raises:
            ValueError: If files exist without overwrite=True.
        """
        if not isinstance(directory, (Path, str)):
            raise TypeError(f"directory must be a Path/str, not {type(directory)}")
        print("Exporting data...")
        # Check for issues
        unimported_items = [item for item in self if item.is_path]
        unvalidated_items = [
            item for item in self if item.is_dataframe and not item.validated
        ]

        if unimported_items:
            print_bold_red("WARNING: Skipping unimported tables: ", end="")
            print_red(list_as_bullets([item.name for item in unimported_items]))

        if unvalidated_items and enforce_validation:
            print_bold_red("WARNING: Skipping unvalidated tables: ", end="")
            print_red(list_as_bullets([item.name for item in unvalidated_items]))

        # Set exportable
        exportable: list[DatasetItem] = []
        for item in self:
            if item.is_dataframe:
                if item.validated or not enforce_validation:
                    exportable.append(item)

        directory = Path(directory)
        filenames = [directory / f"{item.name}.csv" for item in exportable]

        # Check for conflicts and overwrite config
        conflicts = [str(filename) for filename in filenames if filename.exists()]
        if conflicts and not overwrite:
            raise ValueError(
                f"File exists and overwrite=False: {list_as_bullets(conflicts)}"
            )

        # Export
        for item in exportable:
            print(f" - exporting '{item.name}'")
            item.export_data(directory, overwrite=overwrite, enforce_validation=False)

        print(f"Export complete ({len(exportable)} tables)")

    def apply_dictionary(self, name: str | None = None) -> None:
        """
        Summary:
            Apply a dictionary to a Dataset.

        Arguments:
            name (str | None): The name of the table to apply the dictionary to.
                If None, it is applied to all tables.

        Returns:
            None

        Raises:
            ValueError: If the Dataset does not contain a dictionary.
        """
        if name:
            self[name].apply_dictionary()

        else:
            for item in self:
                item.apply_dictionary()

    # Validation
    def validate(
        self,
        chunk_size: int | None = None,
        feedback: bool = True,
    ) -> None:
        """
        Summary:
            Validate data in the Dataset against the dictionary.

        Arguments:
            chunk_size (int | None): Size of chunks for validating data to optimise RAM usage.
            feedback (bool): Provide user feedback on progress (default: True)

        Returns:
            None

        Raises:
            DataIntegrityError: If there is an issue with the integrity of the data
            DataDictionaryImportError: If there is an issue with importing the dictionary
        """
        if feedback:
            print(f"Validating {len(self)} tables")
        self.__check_dictionary()
        for item in self:
            try:
                item.validate(
                    chunk_size=chunk_size,
                    feedback=feedback,
                )
            except DataIntegrityError:
                pass

        self.__reattach_issues()

        # Report Issues
        try:
            self.check(readout=True)
        except DataIntegrityError:
            pass

        if feedback:
            print("\n", end="")

    def __reattach_issues(self):
        self.issues = Issues()
        for item in self:
            self.issues.extend(item.issues)

    def __items_with_issues(self) -> str:
        items_with_issues = [item.name for item in self if len(item.issues) > 0]
        string = ",".join(items_with_issues) if len(items_with_issues) > 0 else ""
        return string

    def check(self, readout: bool = False) -> bool:
        """
        Summary:
            Check the validity of the Dataset.

        Raises:
            DataIntegrityError: If there is an issue with the integrity of the data, either because:
                - the Dataset is not yet validated
                - there are issues with the integrity of the data
        """
        error = (
            f"WARNING: Unvalidated tables or issues detected in {self.__items_with_issues()}:"
            if len(self.issues) > 0
            else "Dataset not yet validated"
            if not self.validated
            else ""
        )
        if error:
            if readout:
                print_bold_red(f"\n{error}")
                print_red(self.issues)
            raise DataIntegrityError(f"{error}\n{self.issues}")
        else:
            return True

    # Creation Helpers
    @staticmethod
    def _make_item(
        name: str | None,
        data: DataLike,
    ) -> DatasetItem:
        """Normalise a (name, data) pair into a DatasetItem."""
        if isinstance(data, (str, Path)):
            path = Path(data)
            if not path.exists():
                raise FileNotFoundError(f"File not found: {path}")
            if path.suffix.lower() != ".csv":
                raise ValueError(f"Only .csv supported right now, got: {path}")
            resolved_name = _normalise_name(name or path.stem)
            return DatasetItem(name=resolved_name, data=path.resolve())

        if isinstance(data, DataFrame):
            if not name:
                raise ValueError("When providing a DataFrame, 'name' is required.")
            resolved_name = _normalise_name(name)
            return DatasetItem(name=resolved_name, data=data)

        raise TypeError("data must be a Path/str to .csv or a pandas DataFrame.")

    @staticmethod
    def _items_from_pathlike(p: Path) -> list[DatasetItem]:
        """Expand a file/dir path into DatasetItems (non-recursive for dirs)."""
        if not p.exists():
            raise FileNotFoundError(f"Path not found: {p}")

        if p.is_file():
            if p.suffix.lower() != ".csv":
                raise ValueError(f"Expected a .csv file, got: {p.suffix} ({p})")
            return [DatasetItem(name=_normalise_name(p.stem), data=p.resolve())]

        if p.is_dir():
            return [
                DatasetItem(
                    name=_normalise_name(csv_path.stem), data=csv_path.resolve()
                )
                for csv_path in p.glob("*.csv")
            ]

        raise ValueError(f"Unsupported path type: {p}")

    # Validation Helpers
    def __check_dictionary(self):
        if self.dictionary is None or not isinstance(self.dictionary, Dictionary):
            raise DataDictionaryImportError(
                "Data Dictionary not yet imported or generated. "
                + "Validation must first have a Data Dictionary. "
                + "Please first run Dataset.import_dictionary(), including `primary_keys`."
            )

        self.dictionary.check()

    # Other Helpers
    def __getitem__(self, key: int | str) -> DatasetItem:
        if isinstance(key, int):
            return super().__getitem__(key)
        found = self.get(key)
        if found is None:
            raise KeyError(f"No DatasetItem with name '{key}'.")
        return found

    def _dd_loaded(self):
        return self.dictionary is not None

    def _attach_table_dictionaries(self):
        for dataset_item in self:
            table_name = dataset_item.name
            table_dictionary = self.dictionary.get_table(table_name)
            if not table_dictionary:
                raise DataDictionaryImportError(
                    f"No dictionary table found for '{table_name}'"
                )

            dataset_item._attach_table_dictionary(table_dictionary)
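
Taken together, `Dataset` and `DatasetItem` support a load, attach-dictionary, validate, export pipeline. Below is a minimal sketch of that flow, written against the signatures in the diff above; the `./data` and `./validated` directories and the `PATIENT_ID` primary-key column are illustrative assumptions, not things the package ships (only the table name `DEMOGRAPHICS` matches a demo CSV in the wheel).

from pathlib import Path

from valediction.datasets.datasets import Dataset

# Build a Dataset from a directory of CSVs (one DatasetItem per *.csv file).
ds = Dataset.create_from(Path("./data"))

# Either attach an existing dictionary workbook...
#   ds.import_dictionary("PROJECT - Data Dictionary.xlsx")
# ...or infer one from the data. The primary-key mapping is hypothetical;
# supply your own table and column names.
ds.generate_dictionary(
    dictionary_name="PROJECT",
    primary_keys={"DEMOGRAPHICS": ["PATIENT_ID"]},
)

# Validate every table against the attached dictionary. Per-table
# DataIntegrityErrors are swallowed and collected on ds.issues.
ds.validate(chunk_size=10_000_000)

# Stream one table in typed chunks without loading the whole CSV into RAM;
# Dataset supports name-based indexing via __getitem__.
for chunk in ds["DEMOGRAPHICS"].iterate_data_chunks(chunk_size=1_000_000):
    print(chunk.df.shape)

# Re-export the validated, type-cast tables to a new directory.
ds.export_data("./validated", overwrite=True)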