valediction 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- valediction/__init__.py +8 -0
- valediction/convenience.py +50 -0
- valediction/data_types/__init__.py +0 -0
- valediction/data_types/data_type_helpers.py +75 -0
- valediction/data_types/data_types.py +58 -0
- valediction/data_types/type_inference.py +541 -0
- valediction/datasets/__init__.py +0 -0
- valediction/datasets/datasets.py +870 -0
- valediction/datasets/datasets_helpers.py +46 -0
- valediction/demo/DEMO - Data Dictionary.xlsx +0 -0
- valediction/demo/DEMOGRAPHICS.csv +101 -0
- valediction/demo/DIAGNOSES.csv +650 -0
- valediction/demo/LAB_TESTS.csv +1001 -0
- valediction/demo/VITALS.csv +1001 -0
- valediction/demo/__init__.py +6 -0
- valediction/demo/demo_dictionary.py +129 -0
- valediction/dictionary/__init__.py +0 -0
- valediction/dictionary/exporting.py +501 -0
- valediction/dictionary/exporting_helpers.py +371 -0
- valediction/dictionary/generation.py +357 -0
- valediction/dictionary/helpers.py +174 -0
- valediction/dictionary/importing.py +494 -0
- valediction/dictionary/integrity.py +37 -0
- valediction/dictionary/model.py +582 -0
- valediction/dictionary/template/PROJECT - Data Dictionary.xltx +0 -0
- valediction/exceptions.py +22 -0
- valediction/integrity.py +97 -0
- valediction/io/__init__.py +0 -0
- valediction/io/csv_readers.py +307 -0
- valediction/progress.py +206 -0
- valediction/support.py +72 -0
- valediction/validation/__init__.py +0 -0
- valediction/validation/helpers.py +315 -0
- valediction/validation/issues.py +280 -0
- valediction/validation/validation.py +598 -0
- valediction-1.0.0.dist-info/METADATA +15 -0
- valediction-1.0.0.dist-info/RECORD +38 -0
- valediction-1.0.0.dist-info/WHEEL +4 -0
valediction/__init__.py
ADDED
@@ -0,0 +1,8 @@
from valediction.datasets.datasets import Dataset  # noqa
from valediction.dictionary.importing import import_dictionary  # noqa
from valediction.dictionary.exporting import export_dictionary  # noqa
from valediction.dictionary.model import Dictionary, Table, Column  # noqa
from valediction import demo  # noqa
from valediction.convenience import validate  # noqa
from valediction.integrity import get_config, reset_default_config, Config  # noqa
from valediction.data_types.data_types import DataType  # noqa
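The package root flattens the public API, so Dataset, Dictionary, Table, Column, DataType, validate, import_dictionary, export_dictionary, and the config helpers are all importable from valediction directly. A minimal sketch using only the names re-exported above:

from valediction import DataType, get_config

print(DataType.parse("int"))      # Integer
print(get_config().date_formats)  # configured datetime format strings mapped to DataType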
valediction/convenience.py
ADDED
@@ -0,0 +1,50 @@
from pathlib import Path

from pandas import DataFrame

from valediction.datasets.datasets import Dataset
from valediction.dictionary.importing import import_dictionary
from valediction.dictionary.model import Dictionary


def validate(
    data: str | Path | dict[str, DataFrame] | Dictionary,
    dictionary: Dictionary | str | Path,
    *,
    import_data: bool = False,
    chunk_size: int | None = 10_000_000,
    feedback: bool = True,
) -> Dataset:
    """Validate the dataset against the dictionary. Run dataset.check() afterwards to
    raise an Exception if issues were found.

    Arguments:
        data (str | Path | dict[str, DataFrame]): path to CSV, DataFrame, or dictionary of table names
            to DataFrames
        dictionary (Dictionary | str | Path): dictionary to validate against, as a Dictionary object
            or .xlsx filepath
        import_data (bool, optional): whether to load all data into memory. Defaults to False.
        chunk_size (int | None, optional): size of chunks for validating data to optimise RAM usage.
            Defaults to 10_000_000.
        feedback (bool, optional): whether to provide user feedback on progress. Defaults to True.

    Returns:
        Dataset: dataset, with or without Issues
    """
    dictionary = (
        dictionary
        if isinstance(dictionary, Dictionary)
        else import_dictionary(dictionary)
    )
    data: Dataset = Dataset.create_from(data)
    data.import_dictionary(dictionary)

    if import_data:
        data.import_data()

    data.validate(
        chunk_size=chunk_size,
        feedback=feedback,
    )

    return data
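A hedged usage sketch of the wrapper above: table names keyed to DataFrames (one of the accepted data forms) validated against an .xlsx dictionary. The file names echo the demo data shipped in the wheel; reading the CSVs with dtype=str is an assumption of this sketch, not a requirement of validate().

from pathlib import Path

import pandas as pd

from valediction import validate

# Assumed inputs: CSV exports keyed by table name, plus an exported .xlsx dictionary.
tables = {
    "DEMOGRAPHICS": pd.read_csv("DEMOGRAPHICS.csv", dtype=str),
    "VITALS": pd.read_csv("VITALS.csv", dtype=str),
}

dataset = validate(tables, Path("PROJECT - Data Dictionary.xlsx"), feedback=False)
dataset.check()  # raises if validation recorded any Issues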
valediction/data_types/__init__.py
File without changes
valediction/data_types/data_type_helpers.py
ADDED
@@ -0,0 +1,75 @@
from pandas import NA, Series, to_datetime

from valediction.data_types.data_types import DataType
from valediction.integrity import get_config


def infer_datetime_format(
    series: Series,
    slice_sample_size: int = 100,
) -> str | None:
    """Efficiently infers date/datetime format (or rules out) by looping over slices of
    the column series, ruling out formats.

    Args:
        series (Series): Column data series
        slice_sample_size (int, optional): Number of rows to test in each slice
            loop. Defaults to 100.

    Raises:
        ValueError: No values, or ambiguous format after full scan.

    Returns:
        str | None: datetime format string, or None if no format matches.
    """
    datetime_formats = get_config().date_formats.keys()
    values = series.str.strip().replace("", NA).dropna()
    if values.empty:
        raise ValueError("Series has no non-null values to test.")

    start_i = 0
    total = len(values)
    last_ambiguous: list[str] | None = None
    remaining = list(datetime_formats)

    # loop over slices
    while start_i < total:
        end_i = min(start_i + slice_sample_size, total)
        sample = values.iloc[start_i:end_i]

        valid_formats: list[str] = []
        for fmt in remaining:
            try:
                to_datetime(sample, format=fmt, errors="raise")
                valid_formats.append(fmt)
            except Exception:
                pass

        remaining = valid_formats

        # Decision
        current = valid_formats
        if len(current) == 1:
            return current[0]
        elif len(current) == 0:
            return None
        else:
            last_ambiguous = current
            start_i = end_i  # advance to next slice

    # all values scanned and format still ambiguous
    raise ValueError(f"Ambiguous datetime format after scanning: {last_ambiguous}")


def get_date_type(datetime_format: str) -> DataType | None:
    """Identifies if a datetime format string corresponds to a Date or Datetime data
    type.

    Args:
        datetime_format (str): datetime format string

    Returns:
        DataType | None: DataType of Date, Datetime, or None if not found.
    """
    config = get_config()
    return config.date_formats.get(datetime_format)
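A small sketch of the format inference above. Which candidate formats exist depends on get_config().date_formats, so the inferred string in the comments is an assumption rather than a guaranteed result:

from pandas import Series

from valediction.data_types.data_type_helpers import get_date_type, infer_datetime_format

# Hypothetical string column; blanks are stripped and dropped before formats are tested.
dates = Series(["2024-01-31", "2024-02-29", "  "])
fmt = infer_datetime_format(dates, slice_sample_size=100)
print(fmt)                 # e.g. "%Y-%m-%d", if that format is in the configured set
print(get_date_type(fmt))  # Date or Datetime for a configured format, otherwise None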
valediction/data_types/data_types.py
ADDED
@@ -0,0 +1,58 @@
from __future__ import annotations

from enum import Enum


class DataType(Enum):
    TEXT = "Text"
    INTEGER = "Integer"
    FLOAT = "Float"
    DATE = "Date"
    DATETIME = "Datetime"
    FILE = "File"

    def __str__(self) -> str:
        return self.value

    def __repr__(self) -> str:
        return self.value

    @classmethod
    def parse(cls, data_type: str) -> DataType:
        """Case-insensitive, forgiving parser."""
        normalised = (data_type or "").strip().lower()
        aliases = {
            "text": cls.TEXT,
            "string": cls.TEXT,
            "str": cls.TEXT,
            "int": cls.INTEGER,
            "integer": cls.INTEGER,
            "float": cls.FLOAT,
            "double": cls.FLOAT,
            "number": cls.FLOAT,
            "numeric": cls.FLOAT,
            "date": cls.DATE,
            "datetime": cls.DATETIME,
            "datetime64": cls.DATETIME,
            "timestamp": cls.DATETIME,
            "file": cls.FILE,
            "blob": cls.FILE,
            "binary": cls.FILE,
        }
        try:
            return aliases[normalised]
        except KeyError as error:
            raise ValueError(f"Unknown data type: {data_type!r}") from error

    def allows_length(self) -> bool:
        """Only TEXT should have a length attribute."""
        return self in {DataType.TEXT}

    def valid_for_primary_key(self) -> bool:
        """PKs can only be Text, Integer, Date, Datetime."""
        return self in {
            DataType.TEXT,
            DataType.INTEGER,
            DataType.DATE,
            DataType.DATETIME,
        }
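A short sketch exercising DataType directly; the assertions follow from the aliases and predicate sets defined above:

from valediction import DataType

assert DataType.parse(" Float ") is DataType.FLOAT       # case- and whitespace-forgiving
assert DataType.parse("timestamp") is DataType.DATETIME  # alias for Datetime
assert DataType.TEXT.allows_length()                     # only Text carries a length
assert not DataType.FLOAT.valid_for_primary_key()        # Float and File cannot be primary keys

try:
    DataType.parse("decimal")
except ValueError as error:
    print(error)  # Unknown data type: 'decimal'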