valediction 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. valediction/__init__.py +8 -0
  2. valediction/convenience.py +50 -0
  3. valediction/data_types/__init__.py +0 -0
  4. valediction/data_types/data_type_helpers.py +75 -0
  5. valediction/data_types/data_types.py +58 -0
  6. valediction/data_types/type_inference.py +541 -0
  7. valediction/datasets/__init__.py +0 -0
  8. valediction/datasets/datasets.py +870 -0
  9. valediction/datasets/datasets_helpers.py +46 -0
  10. valediction/demo/DEMO - Data Dictionary.xlsx +0 -0
  11. valediction/demo/DEMOGRAPHICS.csv +101 -0
  12. valediction/demo/DIAGNOSES.csv +650 -0
  13. valediction/demo/LAB_TESTS.csv +1001 -0
  14. valediction/demo/VITALS.csv +1001 -0
  15. valediction/demo/__init__.py +6 -0
  16. valediction/demo/demo_dictionary.py +129 -0
  17. valediction/dictionary/__init__.py +0 -0
  18. valediction/dictionary/exporting.py +501 -0
  19. valediction/dictionary/exporting_helpers.py +371 -0
  20. valediction/dictionary/generation.py +357 -0
  21. valediction/dictionary/helpers.py +174 -0
  22. valediction/dictionary/importing.py +494 -0
  23. valediction/dictionary/integrity.py +37 -0
  24. valediction/dictionary/model.py +582 -0
  25. valediction/dictionary/template/PROJECT - Data Dictionary.xltx +0 -0
  26. valediction/exceptions.py +22 -0
  27. valediction/integrity.py +97 -0
  28. valediction/io/__init__.py +0 -0
  29. valediction/io/csv_readers.py +307 -0
  30. valediction/progress.py +206 -0
  31. valediction/support.py +72 -0
  32. valediction/validation/__init__.py +0 -0
  33. valediction/validation/helpers.py +315 -0
  34. valediction/validation/issues.py +280 -0
  35. valediction/validation/validation.py +598 -0
  36. valediction-1.0.0.dist-info/METADATA +15 -0
  37. valediction-1.0.0.dist-info/RECORD +38 -0
  38. valediction-1.0.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,8 @@
1
+ from valediction.datasets.datasets import Dataset # noqa
2
+ from valediction.dictionary.importing import import_dictionary # noqa
3
+ from valediction.dictionary.exporting import export_dictionary # noqa
4
+ from valediction.dictionary.model import Dictionary, Table, Column # noqa
5
+ from valediction import demo # noqa
6
+ from valediction.convenience import validate # noqa
7
+ from valediction.integrity import get_config, reset_default_config, Config # noqa
8
+ from valediction.data_types.data_types import DataType # noqa
@@ -0,0 +1,50 @@
1
+ from pathlib import Path
2
+
3
+ from pandas import DataFrame
4
+
5
+ from valediction.datasets.datasets import Dataset
6
+ from valediction.dictionary.importing import import_dictionary
7
+ from valediction.dictionary.model import Dictionary
8
+
9
+
10
def validate(
    data: str | Path | dict[str, DataFrame] | Dictionary,
    dictionary: Dictionary | str | Path,
    *,
    import_data: bool = False,
    chunk_size: int | None = 10_000_000,
    feedback: bool = True,
) -> Dataset:
    """Validate data against a dictionary and return the resulting Dataset.

    Run ``check()`` on the returned dataset afterwards to raise an Exception if
    there are issues.

    Arguments:
        data (str | Path | dict[str, DataFrame]): path to CSV, DataFrame, or dictionary
            of table names to DataFrames. Passed unchanged to ``Dataset.create_from``.
        dictionary (Dictionary | str | Path): dictionary to validate against as a
            Dictionary object or .xlsx filepath. Anything that is not already a
            Dictionary instance is loaded with ``import_dictionary``.
        import_data (bool, optional): whether to load all data into memory. Defaults to
            False.
        chunk_size (int | None, optional): size of chunks for validating data to
            optimise RAM usage. Defaults to 10_000_000.
        feedback (bool, optional): whether to provide user feedback on progress.
            Defaults to True.

    Returns:
        Dataset: dataset, with or without Issues
    """
    # NOTE(review): the `data` annotation lists `Dictionary`, while the documented
    # inputs are a path, a DataFrame, or a dict of DataFrames - confirm against
    # Dataset.create_from whether `DataFrame` was intended there.
    if not isinstance(dictionary, Dictionary):
        dictionary = import_dictionary(dictionary)

    # Use a dedicated local instead of rebinding the `data` parameter to a
    # different type, which keeps the signature's annotation truthful.
    dataset = Dataset.create_from(data)
    dataset.import_dictionary(dictionary)

    if import_data:
        dataset.import_data()

    dataset.validate(
        chunk_size=chunk_size,
        feedback=feedback,
    )

    return dataset
File without changes
@@ -0,0 +1,75 @@
1
+ from pandas import NA, Series, to_datetime
2
+
3
+ from valediction.data_types.data_types import DataType
4
+ from valediction.integrity import get_config
5
+
6
+
7
def infer_datetime_format(
    series: Series,
    slice_sample_size: int = 100,
) -> str | None:
    """Infer the date/datetime format of a column by elimination over slices.

    The candidate formats are the keys of ``get_config().date_formats``. The
    stripped, non-empty values of the series are walked in consecutive slices;
    after each slice only the formats that parsed every value in that slice are
    kept. The scan stops as soon as exactly one candidate is left (later slices
    are not tested) or no candidate is left.

    Args:
        series (Series): Column data series
        slice_sample_size (int, optional): Number of rows to test in each slice
            loop. Defaults to 100.

    Raises:
        ValueError: No values, or ambiguous format after full scan.

    Returns:
        str | None: datetime format string, or None if no format matches.
    """

    def _fits(chunk: Series, fmt: str) -> bool:
        # Any parsing failure simply means "this format does not fit the chunk".
        try:
            to_datetime(chunk, format=fmt, errors="raise")
        except Exception:
            return False
        return True

    candidates = list(get_config().date_formats.keys())

    # Blank strings are treated as missing values and dropped with the nulls.
    cleaned = series.str.strip().replace("", NA).dropna()
    if cleaned.empty:
        raise ValueError("Series has no non-null values to test.")

    row_count = len(cleaned)
    offset = 0
    still_ambiguous: list[str] | None = None

    while offset < row_count:
        stop = min(offset + slice_sample_size, row_count)
        chunk = cleaned.iloc[offset:stop]

        # Keep only the formats that survive this slice.
        candidates = [fmt for fmt in candidates if _fits(chunk, fmt)]

        if not candidates:
            return None
        if len(candidates) == 1:
            return candidates[0]

        # More than one format still fits: remember them and move on.
        still_ambiguous = candidates
        offset = stop

    # all values scanned and format still ambiguous
    raise ValueError(f"Ambiguous datetime format after scanning: {still_ambiguous}")
62
+
63
+
64
def get_date_type(datetime_format: str) -> DataType | None:
    """Look up the data type registered for a datetime format string.

    The lookup is done in the ``date_formats`` mapping of the current config.

    Args:
        datetime_format (str): datetime format string

    Returns:
        DataType | None: the mapped DataType (Date or Datetime), or None if the
            format is not present in the config.
    """
    return get_config().date_formats.get(datetime_format)
@@ -0,0 +1,58 @@
1
+ from __future__ import annotations
2
+
3
+ from enum import Enum
4
+
5
+
6
class DataType(Enum):
    """Closed set of column data types, each carrying its display name as value."""

    TEXT = "Text"
    INTEGER = "Integer"
    FLOAT = "Float"
    DATE = "Date"
    DATETIME = "Datetime"
    FILE = "File"

    def __str__(self) -> str:
        return self.value

    def __repr__(self) -> str:
        # Deliberately the bare display name rather than the default <DataType.X: ...>.
        return self.value

    @classmethod
    def parse(cls, data_type: str | DataType | None) -> DataType:
        """Case-insensitive, forgiving parser.

        Args:
            data_type (str | DataType | None): a type name or alias (surrounding
                whitespace and case are ignored), or an existing DataType member,
                which is returned unchanged.

        Raises:
            ValueError: the value is empty, not a string, or not a known alias.

        Returns:
            DataType: the matching member.
        """
        if isinstance(data_type, cls):
            return data_type

        # Non-string input (None, NaN from an empty cell, numbers, ...) must be
        # reported as an unknown type, not crash on a missing `.strip()`.
        if not isinstance(data_type, str):
            raise ValueError(f"Unknown data type: {data_type!r}")

        normalised = data_type.strip().lower()
        aliases = {
            "text": cls.TEXT,
            "string": cls.TEXT,
            "str": cls.TEXT,
            "int": cls.INTEGER,
            "integer": cls.INTEGER,
            "float": cls.FLOAT,
            "double": cls.FLOAT,
            "number": cls.FLOAT,
            "numeric": cls.FLOAT,
            "date": cls.DATE,
            "datetime": cls.DATETIME,
            "datetime64": cls.DATETIME,
            "timestamp": cls.DATETIME,
            "file": cls.FILE,
            "blob": cls.FILE,
            "binary": cls.FILE,
        }
        try:
            return aliases[normalised]
        except KeyError as error:
            raise ValueError(f"Unknown data type: {data_type!r}") from error

    def allows_length(self) -> bool:
        """Only TEXT should have a length attribute."""
        return self is DataType.TEXT

    def valid_for_primary_key(self) -> bool:
        """PKs can only be Text, Integer, Date, Datetime."""
        return self in {
            DataType.TEXT,
            DataType.INTEGER,
            DataType.DATE,
            DataType.DATETIME,
        }