valediction 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. valediction-1.0.0/.gitignore +72 -0
  2. valediction-1.0.0/PKG-INFO +15 -0
  3. valediction-1.0.0/pyproject.toml +57 -0
  4. valediction-1.0.0/src/valediction/__init__.py +8 -0
  5. valediction-1.0.0/src/valediction/convenience.py +50 -0
  6. valediction-1.0.0/src/valediction/data_types/__init__.py +0 -0
  7. valediction-1.0.0/src/valediction/data_types/data_type_helpers.py +75 -0
  8. valediction-1.0.0/src/valediction/data_types/data_types.py +58 -0
  9. valediction-1.0.0/src/valediction/data_types/type_inference.py +541 -0
  10. valediction-1.0.0/src/valediction/datasets/__init__.py +0 -0
  11. valediction-1.0.0/src/valediction/datasets/datasets.py +870 -0
  12. valediction-1.0.0/src/valediction/datasets/datasets_helpers.py +46 -0
  13. valediction-1.0.0/src/valediction/demo/DEMO - Data Dictionary.xlsx +0 -0
  14. valediction-1.0.0/src/valediction/demo/DEMOGRAPHICS.csv +101 -0
  15. valediction-1.0.0/src/valediction/demo/DIAGNOSES.csv +650 -0
  16. valediction-1.0.0/src/valediction/demo/LAB_TESTS.csv +1001 -0
  17. valediction-1.0.0/src/valediction/demo/VITALS.csv +1001 -0
  18. valediction-1.0.0/src/valediction/demo/__init__.py +6 -0
  19. valediction-1.0.0/src/valediction/demo/demo_dictionary.py +129 -0
  20. valediction-1.0.0/src/valediction/dictionary/__init__.py +0 -0
  21. valediction-1.0.0/src/valediction/dictionary/exporting.py +501 -0
  22. valediction-1.0.0/src/valediction/dictionary/exporting_helpers.py +371 -0
  23. valediction-1.0.0/src/valediction/dictionary/generation.py +357 -0
  24. valediction-1.0.0/src/valediction/dictionary/helpers.py +174 -0
  25. valediction-1.0.0/src/valediction/dictionary/importing.py +494 -0
  26. valediction-1.0.0/src/valediction/dictionary/integrity.py +37 -0
  27. valediction-1.0.0/src/valediction/dictionary/model.py +582 -0
  28. valediction-1.0.0/src/valediction/dictionary/template/PROJECT - Data Dictionary.xltx +0 -0
  29. valediction-1.0.0/src/valediction/exceptions.py +22 -0
  30. valediction-1.0.0/src/valediction/integrity.py +97 -0
  31. valediction-1.0.0/src/valediction/io/__init__.py +0 -0
  32. valediction-1.0.0/src/valediction/io/csv_readers.py +307 -0
  33. valediction-1.0.0/src/valediction/progress.py +206 -0
  34. valediction-1.0.0/src/valediction/support.py +72 -0
  35. valediction-1.0.0/src/valediction/validation/__init__.py +0 -0
  36. valediction-1.0.0/src/valediction/validation/helpers.py +315 -0
  37. valediction-1.0.0/src/valediction/validation/issues.py +280 -0
  38. valediction-1.0.0/src/valediction/validation/validation.py +598 -0
@@ -0,0 +1,72 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ tests/outputs
6
+ .ruff_cache
7
+ .pytest_cache
8
+
9
+ # C extensions
10
+ *.so
11
+
12
+ # Distribution / packaging
13
+ .Python
14
+ build/
15
+ develop-eggs/
16
+ dist/
17
+ downloads/
18
+ eggs/
19
+ .eggs/
20
+ lib/
21
+ lib64/
22
+ parts/
23
+ sdist/
24
+ var/
25
+ wheels/
26
+ pip-wheel-metadata/
27
+ share/python-wheels/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+ MANIFEST
32
+ poetry.lock
33
+ uv.lock
34
+ .coverage
35
+
36
+ # mypy
37
+ .mypy_cache/
38
+ .dmypy.json
39
+ dmypy.json
40
+
41
+ # Secrets
42
+ *.env
43
+
44
+ # VSCode
45
+ *.vscode
46
+ *.code-workspace
47
+
48
+ # Virtual Environments
49
+ .venv
50
+ env/
51
+ venv/
52
+ ENV/
53
+ env.bak/
54
+ venv.bak/
55
+
56
+ # Other
57
+ *.xlsx
58
+ *.csv
59
+ *.ipynb
60
+ *.xltx
61
+
62
+ # Project Specific
63
+ notebooks/
64
+ exports/
65
+
66
+ # Project Keep
67
+ !src/valediction/dictionary/template/PROJECT - Data Dictionary.xltx
68
+ !src/valediction/demo/DEMO - Data Dictionary.xlsx
69
+ !src/valediction/demo/DEMOGRAPHICS.csv
70
+ !src/valediction/demo/DIAGNOSES.csv
71
+ !src/valediction/demo/LAB_TESTS.csv
72
+ !src/valediction/demo/VITALS.csv
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.4
2
+ Name: valediction
3
+ Version: 1.0.0
4
+ Summary: Valediction is a convenience data validation package that allows generation, import, and constraint enforcement of user-defined data dictionaries against datasets.
5
+ Author-email: Cai Davis <Cai.Davis@uhs.nhs.uk>
6
+ Requires-Python: <4.0,>=3.11
7
+ Requires-Dist: certifi<2025,>=2024.2.2
8
+ Requires-Dist: mohawk<2,>=1.1.0
9
+ Requires-Dist: openpyxl<4,>=3.1.5
10
+ Requires-Dist: pandas<3,>=2.2.1
11
+ Requires-Dist: pydantic<3,>=2.11.4
12
+ Requires-Dist: requests<3,>=2.31.0
13
+ Requires-Dist: tabulate<0.10,>=0.9.0
14
+ Requires-Dist: tqdm>=4.67.1
15
+ Requires-Dist: xlsxwriter<4,>=3.2.3
@@ -0,0 +1,57 @@
1
+ [project]
2
+ name = "valediction"
3
+ version = "1.0.0"
4
+ description = "Valediction is a convenience data validation package that allows generation, import, and constraint enforcement of user-defined data dictionaries against datasets."
5
+ authors = [{ name = "Cai Davis", email = "Cai.Davis@uhs.nhs.uk" }]
6
+ requires-python = ">=3.11,<4.0"
7
+ dependencies = [
8
+ "pandas>=2.2.1,<3",
9
+ "requests>=2.31.0,<3",
10
+ "mohawk>=1.1.0,<2",
11
+ "certifi>=2024.2.2,<2025",
12
+ "openpyxl>=3.1.5,<4",
13
+ "xlsxwriter>=3.2.3,<4",
14
+ "pydantic>=2.11.4,<3",
15
+ "tabulate>=0.9.0,<0.10",
16
+ "tqdm>=4.67.1",
17
+ ]
18
+
19
+ [dependency-groups]
20
+ dev = [
21
+ "pytest>=7.3.1,<8",
22
+ "pytest-cov>=4.0.0,<5",
23
+ "commitizen>=3.13.0,<4",
24
+ "black>=23.12.1,<24",
25
+ "python-dotenv>=1.0.1,<2",
26
+ "ipykernel>=6.29.5,<7",
27
+ "faker>=37.4.0,<38",
28
+ "mypy>=1.18.2,<2",
29
+ "types-openpyxl (>=3.1.5.20250919,<4.0.0.0)",
30
+ "pandas-stubs (>=2.3.2.250827,<3.0.0.0)",
31
+ "pre-commit>=4.3.0",
32
+ ]
33
+
34
+ [tool.hatch.build.targets.sdist]
35
+ include = [
36
+ "src/valediction",
37
+ ]
38
+
39
+ [tool.hatch.build.targets.wheel]
40
+ include = [
41
+ "src/valediction",
42
+ ]
43
+
44
+ [tool.hatch.build.targets.wheel.sources]
45
+ "src/valediction" = "valediction"
46
+
47
+ [build-system]
48
+ requires = ["hatchling"]
49
+ build-backend = "hatchling.build"
50
+
51
+ [tool.commitizen]
52
+ name = "cz_conventional_commits"
53
+ tag_format = "v$major.$minor.$patch$prerelease"
54
+ version_type = "pep440"
55
+ version_provider = "pep621"
56
+ update_changelog_on_bump = true
57
+ major_version_zero = true
@@ -0,0 +1,8 @@
1
+ from valediction.datasets.datasets import Dataset # noqa
2
+ from valediction.dictionary.importing import import_dictionary # noqa
3
+ from valediction.dictionary.exporting import export_dictionary # noqa
4
+ from valediction.dictionary.model import Dictionary, Table, Column # noqa
5
+ from valediction import demo # noqa
6
+ from valediction.convenience import validate # noqa
7
+ from valediction.integrity import get_config, reset_default_config, Config # noqa
8
+ from valediction.data_types.data_types import DataType # noqa
@@ -0,0 +1,50 @@
1
+ from pathlib import Path
2
+
3
+ from pandas import DataFrame
4
+
5
+ from valediction.datasets.datasets import Dataset
6
+ from valediction.dictionary.importing import import_dictionary
7
+ from valediction.dictionary.model import Dictionary
8
+
9
+
10
def validate(
    data: str | Path | dict[str, DataFrame] | Dictionary,
    dictionary: Dictionary | str | Path,
    *,
    import_data: bool = False,
    chunk_size: int | None = 10_000_000,
    feedback: bool = True,
) -> Dataset:
    """Validate data against a dictionary and return the resulting Dataset.

    Run ``check()`` on the returned Dataset afterwards to raise an Exception
    if issues were found.

    Arguments:
        data (str | Path | dict[str, DataFrame]): path to CSV, DataFrame, or
            dictionary of table names to DataFrames
        dictionary (Dictionary | str | Path): dictionary to validate against,
            as a Dictionary object or .xlsx filepath
        import_data (bool, optional): whether to load all data into memory
            before validating. Defaults to False.
        chunk_size (int | None, optional): size of chunks for validating data
            to optimise RAM usage. Defaults to 10_000_000.
        feedback (bool, optional): whether to provide user feedback on
            progress. Defaults to True.

    Returns:
        Dataset: dataset, with or without Issues
    """
    # Anything that is not already a Dictionary is handed to the importer.
    if not isinstance(dictionary, Dictionary):
        dictionary = import_dictionary(dictionary)

    # NOTE(review): the annotation on ``data`` lists Dictionary while the
    # documented inputs mention a bare DataFrame - confirm which of these
    # Dataset.create_from actually accepts.
    # A separate local keeps the ``data`` parameter's declared type intact
    # instead of rebinding it to a Dataset.
    dataset = Dataset.create_from(data)
    dataset.import_dictionary(dictionary)

    if import_data:
        dataset.import_data()

    dataset.validate(chunk_size=chunk_size, feedback=feedback)
    return dataset
@@ -0,0 +1,75 @@
1
+ from pandas import NA, Series, to_datetime
2
+
3
+ from valediction.data_types.data_types import DataType
4
+ from valediction.integrity import get_config
5
+
6
+
7
+ def infer_datetime_format(
8
+ series: Series,
9
+ slice_sample_size: int = 100,
10
+ ) -> str | None:
11
+ """Efficiently infers date/datetime format (or rules out) by looping over slices of
12
+ the column series, ruling out formats.
13
+
14
+ Args:
15
+ series (Series): Column data series
16
+ slice_sample_size (int, optional): Number of rows to test in each slice
17
+ loop. Defaults to 100.
18
+
19
+ Raises:
20
+ ValueError: No values, or ambiguous format after full scan.
21
+
22
+ Returns:
23
+ str | None: datetime format string, or None if no format matches.
24
+ """
25
+ datetime_formats = get_config().date_formats.keys()
26
+ values = series.str.strip().replace("", NA).dropna()
27
+ if values.empty:
28
+ raise ValueError("Series has no non-null values to test.")
29
+
30
+ start_i = 0
31
+ total = len(values)
32
+ last_ambiguous: list[str] | None = None
33
+ remaining = list(datetime_formats)
34
+
35
+ # loop over slices
36
+ while start_i < total:
37
+ end_i = min(start_i + slice_sample_size, total)
38
+ sample = values.iloc[start_i:end_i]
39
+
40
+ valid_formats: list[str] = []
41
+ for fmt in remaining:
42
+ try:
43
+ to_datetime(sample, format=fmt, errors="raise")
44
+ valid_formats.append(fmt)
45
+ except Exception:
46
+ pass
47
+
48
+ remaining = valid_formats
49
+
50
+ # Decision
51
+ current = valid_formats
52
+ if len(current) == 1:
53
+ return current[0]
54
+ elif len(current) == 0:
55
+ return None
56
+ else:
57
+ last_ambiguous = current
58
+ start_i = end_i # advance to next slice
59
+
60
+ # all values scanned and format still ambiguous
61
+ raise ValueError(f"Ambiguous datetime format after scanning: {last_ambiguous}")
62
+
63
+
64
def get_date_type(datetime_format: str) -> DataType | None:
    """Look up the data type registered for a datetime format string.

    Args:
        datetime_format (str): datetime format string

    Returns:
        DataType | None: the entry stored for the format in the active
            config's ``date_formats`` mapping (Date or Datetime), or None
            if the format is not present.
    """
    return get_config().date_formats.get(datetime_format)
@@ -0,0 +1,58 @@
1
+ from __future__ import annotations
2
+
3
+ from enum import Enum
4
+
5
+
6
class DataType(Enum):
    """Enumeration of the supported data type labels."""

    TEXT = "Text"
    INTEGER = "Integer"
    FLOAT = "Float"
    DATE = "Date"
    DATETIME = "Datetime"
    FILE = "File"

    def __str__(self) -> str:
        return self.value

    def __repr__(self) -> str:
        # Deliberately the bare label rather than the default <DataType.X: ...>.
        return self.value

    @classmethod
    def parse(cls, data_type: "str | DataType | None") -> "DataType":
        """Case-insensitive, forgiving parser.

        Args:
            data_type: a type name or alias (surrounding whitespace and case
                are ignored), or an existing DataType member, which is
                returned unchanged.

        Raises:
            ValueError: the value is empty, not a string, or not a known
                name/alias.

        Returns:
            DataType: the matching member.
        """
        if isinstance(data_type, cls):
            return data_type
        if data_type is not None and not isinstance(data_type, str):
            # e.g. a number or NaN: report it as an unknown type rather than
            # failing with AttributeError on .strip().
            raise ValueError(f"Unknown data type: {data_type!r}")

        normalised = (data_type or "").strip().lower()
        aliases = {
            "text": cls.TEXT,
            "string": cls.TEXT,
            "str": cls.TEXT,
            "int": cls.INTEGER,
            "integer": cls.INTEGER,
            "float": cls.FLOAT,
            "double": cls.FLOAT,
            "number": cls.FLOAT,
            "numeric": cls.FLOAT,
            "date": cls.DATE,
            "datetime": cls.DATETIME,
            "datetime64": cls.DATETIME,
            "timestamp": cls.DATETIME,
            "file": cls.FILE,
            "blob": cls.FILE,
            "binary": cls.FILE,
        }
        try:
            return aliases[normalised]
        except KeyError as error:
            raise ValueError(f"Unknown data type: {data_type!r}") from error

    def allows_length(self) -> bool:
        """Only TEXT should have a length attribute."""
        return self is DataType.TEXT

    def valid_for_primary_key(self) -> bool:
        """PKs can only be Text, Integer, Date, Datetime."""
        return self in {
            DataType.TEXT,
            DataType.INTEGER,
            DataType.DATE,
            DataType.DATETIME,
        }