pointblank 0.18.0__py3-none-any.whl → 0.20.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +44 -1
- pointblank/_constants.py +258 -166
- pointblank/_constants_translations.py +378 -0
- pointblank/_interrogation.py +204 -0
- pointblank/_utils_llms_txt.py +20 -0
- pointblank/data/api-docs.txt +793 -1
- pointblank/field.py +1507 -0
- pointblank/generate/__init__.py +17 -0
- pointblank/generate/base.py +49 -0
- pointblank/generate/generators.py +573 -0
- pointblank/generate/regex.py +217 -0
- pointblank/locales/__init__.py +1476 -0
- pointblank/locales/data/AR/address.json +73 -0
- pointblank/locales/data/AR/company.json +60 -0
- pointblank/locales/data/AR/internet.json +19 -0
- pointblank/locales/data/AR/misc.json +7 -0
- pointblank/locales/data/AR/person.json +39 -0
- pointblank/locales/data/AR/text.json +38 -0
- pointblank/locales/data/AT/address.json +84 -0
- pointblank/locales/data/AT/company.json +65 -0
- pointblank/locales/data/AT/internet.json +20 -0
- pointblank/locales/data/AT/misc.json +8 -0
- pointblank/locales/data/AT/person.json +17 -0
- pointblank/locales/data/AT/text.json +35 -0
- pointblank/locales/data/AU/address.json +83 -0
- pointblank/locales/data/AU/company.json +65 -0
- pointblank/locales/data/AU/internet.json +20 -0
- pointblank/locales/data/AU/misc.json +8 -0
- pointblank/locales/data/AU/person.json +17 -0
- pointblank/locales/data/AU/text.json +35 -0
- pointblank/locales/data/BE/address.json +225 -0
- pointblank/locales/data/BE/company.json +129 -0
- pointblank/locales/data/BE/internet.json +36 -0
- pointblank/locales/data/BE/misc.json +6 -0
- pointblank/locales/data/BE/person.json +62 -0
- pointblank/locales/data/BE/text.json +38 -0
- pointblank/locales/data/BG/address.json +75 -0
- pointblank/locales/data/BG/company.json +60 -0
- pointblank/locales/data/BG/internet.json +19 -0
- pointblank/locales/data/BG/misc.json +7 -0
- pointblank/locales/data/BG/person.json +40 -0
- pointblank/locales/data/BG/text.json +38 -0
- pointblank/locales/data/BR/address.json +98 -0
- pointblank/locales/data/BR/company.json +65 -0
- pointblank/locales/data/BR/internet.json +20 -0
- pointblank/locales/data/BR/misc.json +8 -0
- pointblank/locales/data/BR/person.json +17 -0
- pointblank/locales/data/BR/text.json +35 -0
- pointblank/locales/data/CA/address.json +747 -0
- pointblank/locales/data/CA/company.json +120 -0
- pointblank/locales/data/CA/internet.json +24 -0
- pointblank/locales/data/CA/misc.json +11 -0
- pointblank/locales/data/CA/person.json +1033 -0
- pointblank/locales/data/CA/text.json +58 -0
- pointblank/locales/data/CH/address.json +184 -0
- pointblank/locales/data/CH/company.json +112 -0
- pointblank/locales/data/CH/internet.json +20 -0
- pointblank/locales/data/CH/misc.json +10 -0
- pointblank/locales/data/CH/person.json +64 -0
- pointblank/locales/data/CH/text.json +45 -0
- pointblank/locales/data/CL/address.json +71 -0
- pointblank/locales/data/CL/company.json +60 -0
- pointblank/locales/data/CL/internet.json +19 -0
- pointblank/locales/data/CL/misc.json +7 -0
- pointblank/locales/data/CL/person.json +38 -0
- pointblank/locales/data/CL/text.json +38 -0
- pointblank/locales/data/CN/address.json +124 -0
- pointblank/locales/data/CN/company.json +76 -0
- pointblank/locales/data/CN/internet.json +20 -0
- pointblank/locales/data/CN/misc.json +8 -0
- pointblank/locales/data/CN/person.json +50 -0
- pointblank/locales/data/CN/text.json +38 -0
- pointblank/locales/data/CO/address.json +76 -0
- pointblank/locales/data/CO/company.json +60 -0
- pointblank/locales/data/CO/internet.json +19 -0
- pointblank/locales/data/CO/misc.json +7 -0
- pointblank/locales/data/CO/person.json +38 -0
- pointblank/locales/data/CO/text.json +38 -0
- pointblank/locales/data/CY/address.json +62 -0
- pointblank/locales/data/CY/company.json +60 -0
- pointblank/locales/data/CY/internet.json +19 -0
- pointblank/locales/data/CY/misc.json +7 -0
- pointblank/locales/data/CY/person.json +38 -0
- pointblank/locales/data/CY/text.json +38 -0
- pointblank/locales/data/CZ/address.json +70 -0
- pointblank/locales/data/CZ/company.json +61 -0
- pointblank/locales/data/CZ/internet.json +19 -0
- pointblank/locales/data/CZ/misc.json +7 -0
- pointblank/locales/data/CZ/person.json +40 -0
- pointblank/locales/data/CZ/text.json +38 -0
- pointblank/locales/data/DE/address.json +756 -0
- pointblank/locales/data/DE/company.json +101 -0
- pointblank/locales/data/DE/internet.json +22 -0
- pointblank/locales/data/DE/misc.json +11 -0
- pointblank/locales/data/DE/person.json +1026 -0
- pointblank/locales/data/DE/text.json +50 -0
- pointblank/locales/data/DK/address.json +231 -0
- pointblank/locales/data/DK/company.json +65 -0
- pointblank/locales/data/DK/internet.json +20 -0
- pointblank/locales/data/DK/misc.json +7 -0
- pointblank/locales/data/DK/person.json +45 -0
- pointblank/locales/data/DK/text.json +43 -0
- pointblank/locales/data/EE/address.json +69 -0
- pointblank/locales/data/EE/company.json +60 -0
- pointblank/locales/data/EE/internet.json +19 -0
- pointblank/locales/data/EE/misc.json +7 -0
- pointblank/locales/data/EE/person.json +39 -0
- pointblank/locales/data/EE/text.json +38 -0
- pointblank/locales/data/ES/address.json +3086 -0
- pointblank/locales/data/ES/company.json +644 -0
- pointblank/locales/data/ES/internet.json +25 -0
- pointblank/locales/data/ES/misc.json +11 -0
- pointblank/locales/data/ES/person.json +488 -0
- pointblank/locales/data/ES/text.json +49 -0
- pointblank/locales/data/FI/address.json +93 -0
- pointblank/locales/data/FI/company.json +65 -0
- pointblank/locales/data/FI/internet.json +20 -0
- pointblank/locales/data/FI/misc.json +8 -0
- pointblank/locales/data/FI/person.json +17 -0
- pointblank/locales/data/FI/text.json +35 -0
- pointblank/locales/data/FR/address.json +619 -0
- pointblank/locales/data/FR/company.json +111 -0
- pointblank/locales/data/FR/internet.json +22 -0
- pointblank/locales/data/FR/misc.json +11 -0
- pointblank/locales/data/FR/person.json +1066 -0
- pointblank/locales/data/FR/text.json +50 -0
- pointblank/locales/data/GB/address.json +5759 -0
- pointblank/locales/data/GB/company.json +131 -0
- pointblank/locales/data/GB/internet.json +24 -0
- pointblank/locales/data/GB/misc.json +45 -0
- pointblank/locales/data/GB/person.json +578 -0
- pointblank/locales/data/GB/text.json +61 -0
- pointblank/locales/data/GR/address.json +68 -0
- pointblank/locales/data/GR/company.json +61 -0
- pointblank/locales/data/GR/internet.json +19 -0
- pointblank/locales/data/GR/misc.json +7 -0
- pointblank/locales/data/GR/person.json +39 -0
- pointblank/locales/data/GR/text.json +38 -0
- pointblank/locales/data/HK/address.json +79 -0
- pointblank/locales/data/HK/company.json +69 -0
- pointblank/locales/data/HK/internet.json +19 -0
- pointblank/locales/data/HK/misc.json +7 -0
- pointblank/locales/data/HK/person.json +42 -0
- pointblank/locales/data/HK/text.json +38 -0
- pointblank/locales/data/HR/address.json +73 -0
- pointblank/locales/data/HR/company.json +60 -0
- pointblank/locales/data/HR/internet.json +19 -0
- pointblank/locales/data/HR/misc.json +7 -0
- pointblank/locales/data/HR/person.json +38 -0
- pointblank/locales/data/HR/text.json +38 -0
- pointblank/locales/data/HU/address.json +70 -0
- pointblank/locales/data/HU/company.json +61 -0
- pointblank/locales/data/HU/internet.json +19 -0
- pointblank/locales/data/HU/misc.json +7 -0
- pointblank/locales/data/HU/person.json +40 -0
- pointblank/locales/data/HU/text.json +38 -0
- pointblank/locales/data/ID/address.json +68 -0
- pointblank/locales/data/ID/company.json +61 -0
- pointblank/locales/data/ID/internet.json +19 -0
- pointblank/locales/data/ID/misc.json +7 -0
- pointblank/locales/data/ID/person.json +40 -0
- pointblank/locales/data/ID/text.json +38 -0
- pointblank/locales/data/IE/address.json +643 -0
- pointblank/locales/data/IE/company.json +140 -0
- pointblank/locales/data/IE/internet.json +24 -0
- pointblank/locales/data/IE/misc.json +44 -0
- pointblank/locales/data/IE/person.json +55 -0
- pointblank/locales/data/IE/text.json +60 -0
- pointblank/locales/data/IN/address.json +92 -0
- pointblank/locales/data/IN/company.json +65 -0
- pointblank/locales/data/IN/internet.json +20 -0
- pointblank/locales/data/IN/misc.json +8 -0
- pointblank/locales/data/IN/person.json +52 -0
- pointblank/locales/data/IN/text.json +39 -0
- pointblank/locales/data/IS/address.json +63 -0
- pointblank/locales/data/IS/company.json +61 -0
- pointblank/locales/data/IS/internet.json +19 -0
- pointblank/locales/data/IS/misc.json +7 -0
- pointblank/locales/data/IS/person.json +44 -0
- pointblank/locales/data/IS/text.json +38 -0
- pointblank/locales/data/IT/address.json +192 -0
- pointblank/locales/data/IT/company.json +137 -0
- pointblank/locales/data/IT/internet.json +20 -0
- pointblank/locales/data/IT/misc.json +10 -0
- pointblank/locales/data/IT/person.json +70 -0
- pointblank/locales/data/IT/text.json +44 -0
- pointblank/locales/data/JP/address.json +713 -0
- pointblank/locales/data/JP/company.json +113 -0
- pointblank/locales/data/JP/internet.json +22 -0
- pointblank/locales/data/JP/misc.json +10 -0
- pointblank/locales/data/JP/person.json +1057 -0
- pointblank/locales/data/JP/text.json +51 -0
- pointblank/locales/data/KR/address.json +77 -0
- pointblank/locales/data/KR/company.json +68 -0
- pointblank/locales/data/KR/internet.json +19 -0
- pointblank/locales/data/KR/misc.json +7 -0
- pointblank/locales/data/KR/person.json +40 -0
- pointblank/locales/data/KR/text.json +38 -0
- pointblank/locales/data/LT/address.json +66 -0
- pointblank/locales/data/LT/company.json +60 -0
- pointblank/locales/data/LT/internet.json +19 -0
- pointblank/locales/data/LT/misc.json +7 -0
- pointblank/locales/data/LT/person.json +42 -0
- pointblank/locales/data/LT/text.json +38 -0
- pointblank/locales/data/LU/address.json +66 -0
- pointblank/locales/data/LU/company.json +60 -0
- pointblank/locales/data/LU/internet.json +19 -0
- pointblank/locales/data/LU/misc.json +7 -0
- pointblank/locales/data/LU/person.json +38 -0
- pointblank/locales/data/LU/text.json +38 -0
- pointblank/locales/data/LV/address.json +62 -0
- pointblank/locales/data/LV/company.json +60 -0
- pointblank/locales/data/LV/internet.json +19 -0
- pointblank/locales/data/LV/misc.json +7 -0
- pointblank/locales/data/LV/person.json +40 -0
- pointblank/locales/data/LV/text.json +38 -0
- pointblank/locales/data/MT/address.json +61 -0
- pointblank/locales/data/MT/company.json +60 -0
- pointblank/locales/data/MT/internet.json +19 -0
- pointblank/locales/data/MT/misc.json +7 -0
- pointblank/locales/data/MT/person.json +38 -0
- pointblank/locales/data/MT/text.json +38 -0
- pointblank/locales/data/MX/address.json +100 -0
- pointblank/locales/data/MX/company.json +65 -0
- pointblank/locales/data/MX/internet.json +20 -0
- pointblank/locales/data/MX/misc.json +8 -0
- pointblank/locales/data/MX/person.json +18 -0
- pointblank/locales/data/MX/text.json +39 -0
- pointblank/locales/data/NL/address.json +1517 -0
- pointblank/locales/data/NL/company.json +133 -0
- pointblank/locales/data/NL/internet.json +44 -0
- pointblank/locales/data/NL/misc.json +55 -0
- pointblank/locales/data/NL/person.json +365 -0
- pointblank/locales/data/NL/text.json +210 -0
- pointblank/locales/data/NO/address.json +86 -0
- pointblank/locales/data/NO/company.json +66 -0
- pointblank/locales/data/NO/internet.json +20 -0
- pointblank/locales/data/NO/misc.json +8 -0
- pointblank/locales/data/NO/person.json +17 -0
- pointblank/locales/data/NO/text.json +35 -0
- pointblank/locales/data/NZ/address.json +90 -0
- pointblank/locales/data/NZ/company.json +65 -0
- pointblank/locales/data/NZ/internet.json +20 -0
- pointblank/locales/data/NZ/misc.json +8 -0
- pointblank/locales/data/NZ/person.json +17 -0
- pointblank/locales/data/NZ/text.json +39 -0
- pointblank/locales/data/PH/address.json +67 -0
- pointblank/locales/data/PH/company.json +61 -0
- pointblank/locales/data/PH/internet.json +19 -0
- pointblank/locales/data/PH/misc.json +7 -0
- pointblank/locales/data/PH/person.json +40 -0
- pointblank/locales/data/PH/text.json +38 -0
- pointblank/locales/data/PL/address.json +91 -0
- pointblank/locales/data/PL/company.json +65 -0
- pointblank/locales/data/PL/internet.json +20 -0
- pointblank/locales/data/PL/misc.json +8 -0
- pointblank/locales/data/PL/person.json +17 -0
- pointblank/locales/data/PL/text.json +35 -0
- pointblank/locales/data/PT/address.json +90 -0
- pointblank/locales/data/PT/company.json +65 -0
- pointblank/locales/data/PT/internet.json +20 -0
- pointblank/locales/data/PT/misc.json +8 -0
- pointblank/locales/data/PT/person.json +17 -0
- pointblank/locales/data/PT/text.json +35 -0
- pointblank/locales/data/RO/address.json +73 -0
- pointblank/locales/data/RO/company.json +61 -0
- pointblank/locales/data/RO/internet.json +19 -0
- pointblank/locales/data/RO/misc.json +7 -0
- pointblank/locales/data/RO/person.json +40 -0
- pointblank/locales/data/RO/text.json +38 -0
- pointblank/locales/data/RU/address.json +74 -0
- pointblank/locales/data/RU/company.json +60 -0
- pointblank/locales/data/RU/internet.json +19 -0
- pointblank/locales/data/RU/misc.json +7 -0
- pointblank/locales/data/RU/person.json +38 -0
- pointblank/locales/data/RU/text.json +38 -0
- pointblank/locales/data/SE/address.json +247 -0
- pointblank/locales/data/SE/company.json +65 -0
- pointblank/locales/data/SE/internet.json +20 -0
- pointblank/locales/data/SE/misc.json +7 -0
- pointblank/locales/data/SE/person.json +45 -0
- pointblank/locales/data/SE/text.json +43 -0
- pointblank/locales/data/SI/address.json +67 -0
- pointblank/locales/data/SI/company.json +60 -0
- pointblank/locales/data/SI/internet.json +19 -0
- pointblank/locales/data/SI/misc.json +7 -0
- pointblank/locales/data/SI/person.json +38 -0
- pointblank/locales/data/SI/text.json +38 -0
- pointblank/locales/data/SK/address.json +64 -0
- pointblank/locales/data/SK/company.json +60 -0
- pointblank/locales/data/SK/internet.json +19 -0
- pointblank/locales/data/SK/misc.json +7 -0
- pointblank/locales/data/SK/person.json +38 -0
- pointblank/locales/data/SK/text.json +38 -0
- pointblank/locales/data/TR/address.json +105 -0
- pointblank/locales/data/TR/company.json +65 -0
- pointblank/locales/data/TR/internet.json +20 -0
- pointblank/locales/data/TR/misc.json +8 -0
- pointblank/locales/data/TR/person.json +17 -0
- pointblank/locales/data/TR/text.json +35 -0
- pointblank/locales/data/TW/address.json +86 -0
- pointblank/locales/data/TW/company.json +69 -0
- pointblank/locales/data/TW/internet.json +19 -0
- pointblank/locales/data/TW/misc.json +7 -0
- pointblank/locales/data/TW/person.json +42 -0
- pointblank/locales/data/TW/text.json +38 -0
- pointblank/locales/data/US/address.json +996 -0
- pointblank/locales/data/US/company.json +131 -0
- pointblank/locales/data/US/internet.json +22 -0
- pointblank/locales/data/US/misc.json +11 -0
- pointblank/locales/data/US/person.json +1092 -0
- pointblank/locales/data/US/text.json +56 -0
- pointblank/locales/data/_shared/misc.json +42 -0
- pointblank/schema.py +339 -2
- pointblank/validate.py +1263 -11
- {pointblank-0.18.0.dist-info → pointblank-0.20.0.dist-info}/METADATA +45 -1
- pointblank-0.20.0.dist-info/RECORD +366 -0
- {pointblank-0.18.0.dist-info → pointblank-0.20.0.dist-info}/WHEEL +1 -1
- pointblank-0.18.0.dist-info/RECORD +0 -59
- {pointblank-0.18.0.dist-info → pointblank-0.20.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.18.0.dist-info → pointblank-0.20.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.18.0.dist-info → pointblank-0.20.0.dist-info}/top_level.txt +0 -0
pointblank/validate.py
CHANGED
|
@@ -17,6 +17,7 @@ from importlib.metadata import version
|
|
|
17
17
|
from pathlib import Path
|
|
18
18
|
from typing import TYPE_CHECKING, Any, Callable, Literal, NoReturn, ParamSpec, TypeVar
|
|
19
19
|
from zipfile import ZipFile
|
|
20
|
+
from zoneinfo import ZoneInfo
|
|
20
21
|
|
|
21
22
|
import commonmark
|
|
22
23
|
import narwhals as nw
|
|
@@ -4350,6 +4351,18 @@ class Validate:
|
|
|
4350
4351
|
locale's rules. Examples include `"en-US"` for English (United States) and `"fr-FR"` for
|
|
4351
4352
|
French (France). More simply, this can be a language identifier without a designation of
|
|
4352
4353
|
territory, like `"es"` for Spanish.
|
|
4354
|
+
owner
|
|
4355
|
+
An optional string identifying the owner of the data being validated. This is useful for
|
|
4356
|
+
governance purposes, indicating who is responsible for the quality and maintenance of the
|
|
4357
|
+
data. For example, `"data-platform-team"` or `"analytics-engineering"`.
|
|
4358
|
+
consumers
|
|
4359
|
+
An optional string or list of strings identifying who depends on or consumes this data.
|
|
4360
|
+
This helps document data dependencies and can be useful for impact analysis when data
|
|
4361
|
+
quality issues are detected. For example, `"ml-team"` or `["ml-team", "analytics"]`.
|
|
4362
|
+
version
|
|
4363
|
+
An optional string representing the version of the validation plan or data contract. This
|
|
4364
|
+
supports semantic versioning (e.g., `"1.0.0"`, `"2.1.0"`) and is useful for tracking changes
|
|
4365
|
+
to validation rules over time and for organizational governance.
|
|
4353
4366
|
|
|
4354
4367
|
Returns
|
|
4355
4368
|
-------
|
|
@@ -4836,6 +4849,9 @@ class Validate:
|
|
|
4836
4849
|
brief: str | bool | None = None
|
|
4837
4850
|
lang: str | None = None
|
|
4838
4851
|
locale: str | None = None
|
|
4852
|
+
owner: str | None = None
|
|
4853
|
+
consumers: str | list[str] | None = None
|
|
4854
|
+
version: str | None = None
|
|
4839
4855
|
|
|
4840
4856
|
def __post_init__(self):
|
|
4841
4857
|
# Process data through the centralized data processing pipeline
|
|
@@ -4880,6 +4896,36 @@ class Validate:
|
|
|
4880
4896
|
# Transform any shorthands of `brief` to string representations
|
|
4881
4897
|
self.brief = _transform_auto_brief(brief=self.brief)
|
|
4882
4898
|
|
|
4899
|
+
# Validate and normalize the `owner` parameter
|
|
4900
|
+
if self.owner is not None and not isinstance(self.owner, str):
|
|
4901
|
+
raise TypeError(
|
|
4902
|
+
"The `owner=` parameter must be a string representing the owner of the data. "
|
|
4903
|
+
f"Received type: {type(self.owner).__name__}"
|
|
4904
|
+
)
|
|
4905
|
+
|
|
4906
|
+
# Validate and normalize the `consumers` parameter
|
|
4907
|
+
if self.consumers is not None:
|
|
4908
|
+
if isinstance(self.consumers, str):
|
|
4909
|
+
self.consumers = [self.consumers]
|
|
4910
|
+
elif isinstance(self.consumers, list):
|
|
4911
|
+
if not all(isinstance(c, str) for c in self.consumers):
|
|
4912
|
+
raise TypeError(
|
|
4913
|
+
"The `consumers=` parameter must be a string or a list of strings. "
|
|
4914
|
+
"All elements in the list must be strings."
|
|
4915
|
+
)
|
|
4916
|
+
else:
|
|
4917
|
+
raise TypeError(
|
|
4918
|
+
"The `consumers=` parameter must be a string or a list of strings. "
|
|
4919
|
+
f"Received type: {type(self.consumers).__name__}"
|
|
4920
|
+
)
|
|
4921
|
+
|
|
4922
|
+
# Validate the `version` parameter
|
|
4923
|
+
if self.version is not None and not isinstance(self.version, str):
|
|
4924
|
+
raise TypeError(
|
|
4925
|
+
"The `version=` parameter must be a string representing the version. "
|
|
4926
|
+
f"Received type: {type(self.version).__name__}"
|
|
4927
|
+
)
|
|
4928
|
+
|
|
4883
4929
|
# TODO: Add functionality to obtain the column names and types from the table
|
|
4884
4930
|
self.col_names = None
|
|
4885
4931
|
self.col_types = None
|
|
@@ -11530,6 +11576,369 @@ class Validate:
|
|
|
11530
11576
|
|
|
11531
11577
|
return self
|
|
11532
11578
|
|
|
11579
|
+
def data_freshness(
|
|
11580
|
+
self,
|
|
11581
|
+
column: str,
|
|
11582
|
+
max_age: str | datetime.timedelta,
|
|
11583
|
+
reference_time: datetime.datetime | str | None = None,
|
|
11584
|
+
timezone: str | None = None,
|
|
11585
|
+
allow_tz_mismatch: bool = False,
|
|
11586
|
+
pre: Callable | None = None,
|
|
11587
|
+
thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
|
|
11588
|
+
actions: Actions | None = None,
|
|
11589
|
+
brief: str | bool | None = None,
|
|
11590
|
+
active: bool = True,
|
|
11591
|
+
) -> Validate:
|
|
11592
|
+
"""
|
|
11593
|
+
Validate that data in a datetime column is not older than a specified maximum age.
|
|
11594
|
+
|
|
11595
|
+
The `data_freshness()` validation method checks whether the most recent timestamp in the
|
|
11596
|
+
specified datetime column is within the allowed `max_age=` from the `reference_time=` (which
|
|
11597
|
+
defaults to the current time). This is useful for ensuring data pipelines are delivering
|
|
11598
|
+
fresh data and for enforcing data SLAs.
|
|
11599
|
+
|
|
11600
|
+
This method helps detect stale data by comparing the maximum (most recent) value in a
|
|
11601
|
+
datetime column against an expected freshness threshold.
|
|
11602
|
+
|
|
11603
|
+
Parameters
|
|
11604
|
+
----------
|
|
11605
|
+
column
|
|
11606
|
+
The name of the datetime column to check for freshness. This column should contain
|
|
11607
|
+
date or datetime values.
|
|
11608
|
+
max_age
|
|
11609
|
+
The maximum allowed age of the data. Can be specified as: (1) a string with a
|
|
11610
|
+
human-readable duration like `"24 hours"`, `"1 day"`, `"30 minutes"`, `"2 weeks"`, etc.
|
|
11611
|
+
(supported units: `seconds`, `minutes`, `hours`, `days`, `weeks`), or (2) a
|
|
11612
|
+
`datetime.timedelta` object for precise control.
|
|
11613
|
+
reference_time
|
|
11614
|
+
The reference point in time to compare against. Defaults to `None`, which uses the
|
|
11615
|
+
current time (UTC if `timezone=` is not specified). Can be: (1) a `datetime.datetime`
|
|
11616
|
+
object (timezone-aware recommended), (2) a string in ISO 8601 format (e.g.,
|
|
11617
|
+
`"2024-01-15T10:30:00"` or `"2024-01-15T10:30:00+05:30"`), or (3) `None` to use the
|
|
11618
|
+
current time.
|
|
11619
|
+
timezone
|
|
11620
|
+
The timezone to use for interpreting the data and reference time. Accepts IANA
|
|
11621
|
+
timezone names (e.g., `"America/New_York"`), hour offsets (e.g., `"-7"`), or ISO 8601
|
|
11622
|
+
offsets (e.g., `"-07:00"`). When `None` (default), naive datetimes are treated as UTC.
|
|
11623
|
+
See the *The `timezone=` Parameter* section for details.
|
|
11624
|
+
allow_tz_mismatch
|
|
11625
|
+
Whether to allow timezone mismatches between the column data and reference time.
|
|
11626
|
+
By default (`False`), a warning note is added when comparing timezone-naive with
|
|
11627
|
+
timezone-aware datetimes. Set to `True` to suppress these warnings.
|
|
11628
|
+
pre
|
|
11629
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
11630
|
+
interrogation. This function should take a table as input and return a modified table.
|
|
11631
|
+
thresholds
|
|
11632
|
+
Set threshold failure levels for reporting and reacting to exceedences of the levels.
|
|
11633
|
+
The thresholds are set at the step level and will override any global thresholds set in
|
|
11634
|
+
`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
|
|
11635
|
+
be set locally and global thresholds (if any) will take effect.
|
|
11636
|
+
actions
|
|
11637
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
11638
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
11639
|
+
define the actions.
|
|
11640
|
+
brief
|
|
11641
|
+
An optional brief description of the validation step that will be displayed in the
|
|
11642
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
11643
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
11644
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
11645
|
+
won't be a brief.
|
|
11646
|
+
active
|
|
11647
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
11648
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
11649
|
+
for the steps unchanged).
|
|
11650
|
+
|
|
11651
|
+
Returns
|
|
11652
|
+
-------
|
|
11653
|
+
Validate
|
|
11654
|
+
The `Validate` object with the added validation step.
|
|
11655
|
+
|
|
11656
|
+
How Timezones Affect Freshness Checks
|
|
11657
|
+
-------------------------------------
|
|
11658
|
+
Freshness validation involves comparing two times: the **data time** (the most recent
|
|
11659
|
+
timestamp in your column) and the **execution time** (when and where the validation runs).
|
|
11660
|
+
Timezone confusion typically arises because these two times may originate from different
|
|
11661
|
+
contexts.
|
|
11662
|
+
|
|
11663
|
+
Consider these common scenarios:
|
|
11664
|
+
|
|
11665
|
+
- your data timestamps are stored in UTC (common for databases), but you're running
|
|
11666
|
+
validation on your laptop in New York (Eastern Time)
|
|
11667
|
+
- you develop and test validation locally, then deploy it to a cloud workflow that runs
|
|
11668
|
+
in UTC—suddenly your 'same' validation behaves differently
|
|
11669
|
+
- your data comes from servers in multiple regions, each recording timestamps in their
|
|
11670
|
+
local timezone
|
|
11671
|
+
|
|
11672
|
+
The `timezone=` parameter exists to solve this problem by establishing a single, explicit
|
|
11673
|
+
timezone context for the freshness comparison. When you specify a timezone, Pointblank
|
|
11674
|
+
interprets both the data timestamps (if naive) and the execution time in that timezone,
|
|
11675
|
+
ensuring consistent behavior whether you run validation on your laptop or in a cloud
|
|
11676
|
+
workflow.
|
|
11677
|
+
|
|
11678
|
+
**Scenario 1: Data has timezone-aware datetimes**
|
|
11679
|
+
|
|
11680
|
+
```python
|
|
11681
|
+
# Your data column has values like: 2024-01-15 10:30:00+00:00 (UTC)
|
|
11682
|
+
# Comparison is straightforward as both sides have explicit timezones
|
|
11683
|
+
.data_freshness(column="updated_at", max_age="24 hours")
|
|
11684
|
+
```
|
|
11685
|
+
|
|
11686
|
+
**Scenario 2: Data has naive datetimes (no timezone)**
|
|
11687
|
+
|
|
11688
|
+
```python
|
|
11689
|
+
# Your data column has values like: 2024-01-15 10:30:00 (no timezone)
|
|
11690
|
+
# Specify the timezone the data was recorded in:
|
|
11691
|
+
.data_freshness(column="updated_at", max_age="24 hours", timezone="America/New_York")
|
|
11692
|
+
```
|
|
11693
|
+
|
|
11694
|
+
**Scenario 3: Ensuring consistent behavior across environments**
|
|
11695
|
+
|
|
11696
|
+
```python
|
|
11697
|
+
# Pin the timezone to ensure identical results whether running locally or in the cloud
|
|
11698
|
+
.data_freshness(
|
|
11699
|
+
column="updated_at",
|
|
11700
|
+
max_age="24 hours",
|
|
11701
|
+
timezone="UTC", # Explicit timezone removes environment dependence
|
|
11702
|
+
)
|
|
11703
|
+
```
|
|
11704
|
+
|
|
11705
|
+
The `timezone=` Parameter
|
|
11706
|
+
-------------------------
|
|
11707
|
+
The `timezone=` parameter accepts several convenient formats, making it easy to specify
|
|
11708
|
+
timezones in whatever way is most natural for your use case. The following examples
|
|
11709
|
+
illustrate the three supported input styles.
|
|
11710
|
+
|
|
11711
|
+
**IANA Timezone Names** (recommended for regions with daylight saving time):
|
|
11712
|
+
|
|
11713
|
+
```python
|
|
11714
|
+
timezone="America/New_York" # Eastern Time (handles DST automatically)
|
|
11715
|
+
timezone="Europe/London" # UK time
|
|
11716
|
+
timezone="Asia/Tokyo" # Japan Standard Time
|
|
11717
|
+
timezone="Australia/Sydney" # Australian Eastern Time
|
|
11718
|
+
timezone="UTC" # Coordinated Universal Time
|
|
11719
|
+
```
|
|
11720
|
+
|
|
11721
|
+
**Simple Hour Offsets** (quick and easy):
|
|
11722
|
+
|
|
11723
|
+
```python
|
|
11724
|
+
timezone="-7" # UTC-7 (e.g., Mountain Standard Time)
|
|
11725
|
+
timezone="+5" # UTC+5 (e.g., Pakistan Standard Time)
|
|
11726
|
+
timezone="0" # UTC
|
|
11727
|
+
timezone="-12" # UTC-12
|
|
11728
|
+
```
|
|
11729
|
+
|
|
11730
|
+
**ISO 8601 Offset Format** (precise, including fractional hours):
|
|
11731
|
+
|
|
11732
|
+
```python
|
|
11733
|
+
timezone="-07:00" # UTC-7
|
|
11734
|
+
timezone="+05:30" # UTC+5:30 (e.g., India Standard Time)
|
|
11735
|
+
timezone="+00:00" # UTC
|
|
11736
|
+
timezone="-09:30" # UTC-9:30
|
|
11737
|
+
```
|
|
11738
|
+
|
|
11739
|
+
When a timezone is specified:
|
|
11740
|
+
|
|
11741
|
+
- naive datetime values in the column are assumed to be in this timezone.
|
|
11742
|
+
- the reference time (if naive) is assumed to be in this timezone.
|
|
11743
|
+
- the validation report will show times in this timezone.
|
|
11744
|
+
|
|
11745
|
+
When `None` (default):
|
|
11746
|
+
|
|
11747
|
+
- if your column has timezone-aware datetimes, those timezones are used
|
|
11748
|
+
- if your column has naive datetimes, they're treated as UTC
|
|
11749
|
+
- the current time reference uses UTC
|
|
11750
|
+
|
|
11751
|
+
Note that IANA timezone names are preferred when daylight saving time transitions matter, as
|
|
11752
|
+
they automatically handle the offset changes. Fixed offsets like `"-7"` or `"-07:00"` do not
|
|
11753
|
+
account for DST.
|
|
11754
|
+
|
|
11755
|
+
Recommendations for Working with Timestamps
|
|
11756
|
+
-------------------------------------------
|
|
11757
|
+
When working with datetime data, storing timestamps in UTC in your databases is strongly
|
|
11758
|
+
recommended since it provides a consistent reference point regardless of where your data
|
|
11759
|
+
originates or where it's consumed. Using timezone-aware datetimes whenever possible helps
|
|
11760
|
+
avoid ambiguity—when a datetime has an explicit timezone, there's no guessing about what
|
|
11761
|
+
time it actually represents.
|
|
11762
|
+
|
|
11763
|
+
If you're working with naive datetimes (which lack timezone information), always specify the
|
|
11764
|
+
`timezone=` parameter so Pointblank knows how to interpret those values. When providing
|
|
11765
|
+
`reference_time=` as a string, use ISO 8601 format with the timezone offset included (e.g.,
|
|
11766
|
+
`"2024-01-15T10:30:00+00:00"`) to ensure unambiguous parsing. Finally, prefer IANA timezone
|
|
11767
|
+
names (like `"America/New_York"`) over fixed offsets (like `"-05:00"`) when daylight saving
|
|
11768
|
+
time transitions matter, since IANA names automatically handle the twice-yearly offset
|
|
11769
|
+
changes. To see all available IANA timezone names in Python, use
|
|
11770
|
+
`zoneinfo.available_timezones()` from the standard library's `zoneinfo` module.
|
|
11771
|
+
|
|
11772
|
+
Examples
|
|
11773
|
+
--------
|
|
11774
|
+
```{python}
|
|
11775
|
+
#| echo: false
|
|
11776
|
+
#| output: false
|
|
11777
|
+
import pointblank as pb
|
|
11778
|
+
pb.config(report_incl_header=False, report_incl_footer=False)
|
|
11779
|
+
```
|
|
11780
|
+
|
|
11781
|
+
The simplest use of `data_freshness()` requires just two arguments: the `column=` containing
|
|
11782
|
+
your timestamps and `max_age=` specifying how old the data can be. In this first example,
|
|
11783
|
+
we create sample data with an `"updated_at"` column containing timestamps from 1, 12, and
|
|
11784
|
+
20 hours ago. By setting `max_age="24 hours"`, we're asserting that the most recent
|
|
11785
|
+
timestamp should be within 24 hours of the current time. Since the newest record is only
|
|
11786
|
+
1 hour old, this validation passes.
|
|
11787
|
+
|
|
11788
|
+
```{python}
|
|
11789
|
+
import pointblank as pb
|
|
11790
|
+
import polars as pl
|
|
11791
|
+
from datetime import datetime, timedelta
|
|
11792
|
+
|
|
11793
|
+
# Create sample data with recent timestamps
|
|
11794
|
+
recent_data = pl.DataFrame({
|
|
11795
|
+
"id": [1, 2, 3],
|
|
11796
|
+
"updated_at": [
|
|
11797
|
+
datetime.now() - timedelta(hours=1),
|
|
11798
|
+
datetime.now() - timedelta(hours=12),
|
|
11799
|
+
datetime.now() - timedelta(hours=20),
|
|
11800
|
+
]
|
|
11801
|
+
})
|
|
11802
|
+
|
|
11803
|
+
validation = (
|
|
11804
|
+
pb.Validate(data=recent_data)
|
|
11805
|
+
.data_freshness(column="updated_at", max_age="24 hours")
|
|
11806
|
+
.interrogate()
|
|
11807
|
+
)
|
|
11808
|
+
|
|
11809
|
+
validation
|
|
11810
|
+
```
|
|
11811
|
+
|
|
11812
|
+
The `max_age=` parameter accepts human-readable strings with various time units. You can
|
|
11813
|
+
chain multiple `data_freshness()` calls to check different freshness thresholds
|
|
11814
|
+
simultaneously—useful for tiered SLAs where you might want warnings at 30 minutes but
|
|
11815
|
+
errors at 2 days.
|
|
11816
|
+
|
|
11817
|
+
```{python}
|
|
11818
|
+
# Check data is fresh within different time windows
|
|
11819
|
+
validation = (
|
|
11820
|
+
pb.Validate(data=recent_data)
|
|
11821
|
+
.data_freshness(column="updated_at", max_age="30 minutes") # Very fresh
|
|
11822
|
+
.data_freshness(column="updated_at", max_age="2 days") # Reasonably fresh
|
|
11823
|
+
.data_freshness(column="updated_at", max_age="1 week") # Within a week
|
|
11824
|
+
.interrogate()
|
|
11825
|
+
)
|
|
11826
|
+
|
|
11827
|
+
validation
|
|
11828
|
+
```
|
|
11829
|
+
|
|
11830
|
+
When your data contains naive datetimes (timestamps without timezone information), use the
|
|
11831
|
+
`timezone=` parameter to specify what timezone those values represent. Here we have event
|
|
11832
|
+
data recorded in Eastern Time, so we set `timezone="America/New_York"` to ensure the
|
|
11833
|
+
freshness comparison is done correctly.
|
|
11834
|
+
|
|
11835
|
+
```{python}
|
|
11836
|
+
# Data with naive datetimes (assume they're in Eastern Time)
|
|
11837
|
+
eastern_data = pl.DataFrame({
|
|
11838
|
+
"event_time": [
|
|
11839
|
+
datetime.now() - timedelta(hours=2),
|
|
11840
|
+
datetime.now() - timedelta(hours=5),
|
|
11841
|
+
]
|
|
11842
|
+
})
|
|
11843
|
+
|
|
11844
|
+
validation = (
|
|
11845
|
+
pb.Validate(data=eastern_data)
|
|
11846
|
+
.data_freshness(
|
|
11847
|
+
column="event_time",
|
|
11848
|
+
max_age="12 hours",
|
|
11849
|
+
timezone="America/New_York" # Interpret times as Eastern
|
|
11850
|
+
)
|
|
11851
|
+
.interrogate()
|
|
11852
|
+
)
|
|
11853
|
+
|
|
11854
|
+
validation
|
|
11855
|
+
```
|
|
11856
|
+
|
|
11857
|
+
For reproducible validations or historical checks, you can use `reference_time=` to compare
|
|
11858
|
+
against a specific point in time instead of the current time. This is particularly useful
|
|
11859
|
+
for testing or when validating data snapshots. The reference time should include a timezone
|
|
11860
|
+
offset (like `+00:00` for UTC) to avoid ambiguity.
|
|
11861
|
+
|
|
11862
|
+
```{python}
|
|
11863
|
+
validation = (
|
|
11864
|
+
pb.Validate(data=recent_data)
|
|
11865
|
+
.data_freshness(
|
|
11866
|
+
column="updated_at",
|
|
11867
|
+
max_age="24 hours",
|
|
11868
|
+
reference_time="2024-01-15T12:00:00+00:00"
|
|
11869
|
+
)
|
|
11870
|
+
.interrogate()
|
|
11871
|
+
)
|
|
11872
|
+
|
|
11873
|
+
validation
|
|
11874
|
+
```
|
|
11875
|
+
"""
|
|
11876
|
+
|
|
11877
|
+
assertion_type = _get_fn_name()
|
|
11878
|
+
|
|
11879
|
+
_check_pre(pre=pre)
|
|
11880
|
+
_check_thresholds(thresholds=thresholds)
|
|
11881
|
+
_check_boolean_input(param=active, param_name="active")
|
|
11882
|
+
_check_boolean_input(param=allow_tz_mismatch, param_name="allow_tz_mismatch")
|
|
11883
|
+
|
|
11884
|
+
# Validate and parse the max_age parameter
|
|
11885
|
+
max_age_td = _parse_max_age(max_age)
|
|
11886
|
+
|
|
11887
|
+
# Validate the column parameter
|
|
11888
|
+
if not isinstance(column, str):
|
|
11889
|
+
raise TypeError(
|
|
11890
|
+
f"The `column` parameter must be a string, got {type(column).__name__}."
|
|
11891
|
+
)
|
|
11892
|
+
|
|
11893
|
+
# Validate the timezone parameter if provided
|
|
11894
|
+
if timezone is not None:
|
|
11895
|
+
_validate_timezone(timezone)
|
|
11896
|
+
|
|
11897
|
+
# Parse reference_time if it's a string
|
|
11898
|
+
parsed_reference_time = None
|
|
11899
|
+
if reference_time is not None:
|
|
11900
|
+
if isinstance(reference_time, str):
|
|
11901
|
+
parsed_reference_time = _parse_reference_time(reference_time)
|
|
11902
|
+
elif isinstance(reference_time, datetime.datetime):
|
|
11903
|
+
parsed_reference_time = reference_time
|
|
11904
|
+
else:
|
|
11905
|
+
raise TypeError(
|
|
11906
|
+
f"The `reference_time` parameter must be a string or datetime object, "
|
|
11907
|
+
f"got {type(reference_time).__name__}."
|
|
11908
|
+
)
|
|
11909
|
+
|
|
11910
|
+
# Determine threshold to use (global or local) and normalize a local `thresholds=` value
|
|
11911
|
+
thresholds = (
|
|
11912
|
+
self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
|
|
11913
|
+
)
|
|
11914
|
+
|
|
11915
|
+
# Package up the parameters for later interrogation
|
|
11916
|
+
values = {
|
|
11917
|
+
"max_age": max_age_td,
|
|
11918
|
+
"max_age_str": max_age if isinstance(max_age, str) else str(max_age),
|
|
11919
|
+
"reference_time": parsed_reference_time,
|
|
11920
|
+
"timezone": timezone,
|
|
11921
|
+
"allow_tz_mismatch": allow_tz_mismatch,
|
|
11922
|
+
}
|
|
11923
|
+
|
|
11924
|
+
# Determine brief to use (global or local) and transform any shorthands of `brief=`
|
|
11925
|
+
brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
|
|
11926
|
+
|
|
11927
|
+
val_info = _ValidationInfo(
|
|
11928
|
+
assertion_type=assertion_type,
|
|
11929
|
+
column=column,
|
|
11930
|
+
values=values,
|
|
11931
|
+
pre=pre,
|
|
11932
|
+
thresholds=thresholds,
|
|
11933
|
+
actions=actions,
|
|
11934
|
+
brief=brief,
|
|
11935
|
+
active=active,
|
|
11936
|
+
)
|
|
11937
|
+
|
|
11938
|
+
self._add_validation(validation_info=val_info)
|
|
11939
|
+
|
|
11940
|
+
return self
|
|
11941
|
+
|
|
11533
11942
|
def col_count_match(
|
|
11534
11943
|
self,
|
|
11535
11944
|
count: int | Any,
|
|
@@ -12941,6 +13350,8 @@ class Validate:
|
|
|
12941
13350
|
"col_schema_match",
|
|
12942
13351
|
"row_count_match",
|
|
12943
13352
|
"col_count_match",
|
|
13353
|
+
"data_freshness",
|
|
13354
|
+
"tbl_match",
|
|
12944
13355
|
]
|
|
12945
13356
|
|
|
12946
13357
|
if validation.n == 0 and assertion_type not in table_level_assertions:
|
|
@@ -13201,6 +13612,105 @@ class Validate:
|
|
|
13201
13612
|
|
|
13202
13613
|
results_tbl = None
|
|
13203
13614
|
|
|
13615
|
+
elif assertion_type == "data_freshness":
|
|
13616
|
+
from pointblank._interrogation import data_freshness as data_freshness_check
|
|
13617
|
+
|
|
13618
|
+
freshness_result = data_freshness_check(
|
|
13619
|
+
data_tbl=data_tbl_step,
|
|
13620
|
+
column=column,
|
|
13621
|
+
max_age=value["max_age"],
|
|
13622
|
+
reference_time=value["reference_time"],
|
|
13623
|
+
timezone=value["timezone"],
|
|
13624
|
+
allow_tz_mismatch=value["allow_tz_mismatch"],
|
|
13625
|
+
)
|
|
13626
|
+
|
|
13627
|
+
result_bool = freshness_result["passed"]
|
|
13628
|
+
validation.all_passed = result_bool
|
|
13629
|
+
validation.n = 1
|
|
13630
|
+
validation.n_passed = int(result_bool)
|
|
13631
|
+
validation.n_failed = 1 - int(result_bool)
|
|
13632
|
+
|
|
13633
|
+
# Store the freshness check details for reporting
|
|
13634
|
+
validation.val_info = freshness_result
|
|
13635
|
+
|
|
13636
|
+
# Update the values dict with actual computed values for failure text
|
|
13637
|
+
if freshness_result.get("age") is not None:
|
|
13638
|
+
value["age"] = freshness_result["age"]
|
|
13639
|
+
|
|
13640
|
+
# Add timezone warning note if applicable
|
|
13641
|
+
if freshness_result.get("tz_warning_key"):
|
|
13642
|
+
tz_key = freshness_result["tz_warning_key"]
|
|
13643
|
+
tz_warning_text = NOTES_TEXT.get(tz_key, {}).get(
|
|
13644
|
+
self.locale, NOTES_TEXT.get(tz_key, {}).get("en", "")
|
|
13645
|
+
)
|
|
13646
|
+
validation._add_note(
|
|
13647
|
+
key="tz_warning",
|
|
13648
|
+
markdown=f"⚠️ {tz_warning_text}",
|
|
13649
|
+
text=tz_warning_text,
|
|
13650
|
+
)
|
|
13651
|
+
|
|
13652
|
+
# Add note about column being empty if applicable
|
|
13653
|
+
if freshness_result.get("column_empty"):
|
|
13654
|
+
column_empty_text = NOTES_TEXT.get(
|
|
13655
|
+
"data_freshness_column_empty", {}
|
|
13656
|
+
).get(
|
|
13657
|
+
self.locale,
|
|
13658
|
+
NOTES_TEXT.get("data_freshness_column_empty", {}).get(
|
|
13659
|
+
"en", "The datetime column is empty (no values to check)."
|
|
13660
|
+
),
|
|
13661
|
+
)
|
|
13662
|
+
validation._add_note(
|
|
13663
|
+
key="column_empty",
|
|
13664
|
+
markdown=f"⚠️ {column_empty_text}",
|
|
13665
|
+
text=column_empty_text,
|
|
13666
|
+
)
|
|
13667
|
+
|
|
13668
|
+
# Add informational note about the freshness check
|
|
13669
|
+
if freshness_result.get("max_datetime") and freshness_result.get("age"):
|
|
13670
|
+
max_dt = freshness_result["max_datetime"]
|
|
13671
|
+
# Format datetime without microseconds for cleaner display
|
|
13672
|
+
if hasattr(max_dt, "replace"):
|
|
13673
|
+
max_dt_display = max_dt.replace(microsecond=0)
|
|
13674
|
+
else:
|
|
13675
|
+
max_dt_display = max_dt
|
|
13676
|
+
age = freshness_result["age"]
|
|
13677
|
+
age_str = _format_timedelta(age)
|
|
13678
|
+
max_age_str = _format_timedelta(value["max_age"])
|
|
13679
|
+
|
|
13680
|
+
# Get translated template for pass/fail
|
|
13681
|
+
if result_bool:
|
|
13682
|
+
details_key = "data_freshness_details_pass"
|
|
13683
|
+
prefix = "✓"
|
|
13684
|
+
else:
|
|
13685
|
+
details_key = "data_freshness_details_fail"
|
|
13686
|
+
prefix = "✗"
|
|
13687
|
+
|
|
13688
|
+
details_template = NOTES_TEXT.get(details_key, {}).get(
|
|
13689
|
+
self.locale,
|
|
13690
|
+
NOTES_TEXT.get(details_key, {}).get(
|
|
13691
|
+
"en",
|
|
13692
|
+
"Most recent data: `{max_dt}` (age: {age}, max allowed: {max_age})",
|
|
13693
|
+
),
|
|
13694
|
+
)
|
|
13695
|
+
|
|
13696
|
+
# Format the template with values
|
|
13697
|
+
note_text = details_template.format(
|
|
13698
|
+
max_dt=max_dt_display, age=age_str, max_age=max_age_str
|
|
13699
|
+
)
|
|
13700
|
+
# For markdown, make the age bold
|
|
13701
|
+
note_md_template = details_template.replace(
|
|
13702
|
+
"(age: {age}", "(age: **{age}**"
|
|
13703
|
+
)
|
|
13704
|
+
note_md = f"{prefix} {note_md_template.format(max_dt=max_dt_display, age=age_str, max_age=max_age_str)}"
|
|
13705
|
+
|
|
13706
|
+
validation._add_note(
|
|
13707
|
+
key="freshness_details",
|
|
13708
|
+
markdown=note_md,
|
|
13709
|
+
text=note_text,
|
|
13710
|
+
)
|
|
13711
|
+
|
|
13712
|
+
results_tbl = None
|
|
13713
|
+
|
|
13204
13714
|
elif assertion_type == "tbl_match":
|
|
13205
13715
|
from pointblank._interrogation import tbl_match
|
|
13206
13716
|
|
|
@@ -13265,6 +13775,15 @@ class Validate:
|
|
|
13265
13775
|
validation.n_passed = int(result_bool)
|
|
13266
13776
|
validation.n_failed = 1 - result_bool
|
|
13267
13777
|
|
|
13778
|
+
# Store computed values for step reports
|
|
13779
|
+
validation.val_info = {
|
|
13780
|
+
"actual": real,
|
|
13781
|
+
"target": target,
|
|
13782
|
+
"tol": tol,
|
|
13783
|
+
"lower_bound": lower_bound,
|
|
13784
|
+
"upper_bound": upper_bound,
|
|
13785
|
+
}
|
|
13786
|
+
|
|
13268
13787
|
results_tbl = None
|
|
13269
13788
|
else:
|
|
13270
13789
|
raise ValueError(
|
|
@@ -16045,6 +16564,69 @@ class Validate:
|
|
|
16045
16564
|
tol_value = bound_finder.keywords.get("tol", 0) if bound_finder else 0
|
|
16046
16565
|
values_upd.append(f"p = {p_value}<br/>tol = {tol_value}")
|
|
16047
16566
|
|
|
16567
|
+
elif assertion_type[i] in ["data_freshness"]:
|
|
16568
|
+
# Format max_age nicely for display
|
|
16569
|
+
max_age = value.get("max_age")
|
|
16570
|
+
max_age_str = _format_timedelta(max_age) if max_age else "—"
|
|
16571
|
+
|
|
16572
|
+
# Build additional lines with non-default parameters
|
|
16573
|
+
extra_lines = []
|
|
16574
|
+
|
|
16575
|
+
if value.get("reference_time") is not None:
|
|
16576
|
+
ref_time = value["reference_time"]
|
|
16577
|
+
|
|
16578
|
+
# Format datetime across two lines: date and time+tz
|
|
16579
|
+
if hasattr(ref_time, "strftime"):
|
|
16580
|
+
date_str = ref_time.strftime("@%Y-%m-%d")
|
|
16581
|
+
time_str = " " + ref_time.strftime("%H:%M:%S")
|
|
16582
|
+
|
|
16583
|
+
# Add timezone offset if present
|
|
16584
|
+
if hasattr(ref_time, "tzinfo") and ref_time.tzinfo is not None:
|
|
16585
|
+
tz_offset = ref_time.strftime("%z")
|
|
16586
|
+
if tz_offset:
|
|
16587
|
+
time_str += tz_offset
|
|
16588
|
+
extra_lines.append(date_str)
|
|
16589
|
+
extra_lines.append(time_str)
|
|
16590
|
+
else:
|
|
16591
|
+
extra_lines.append(f"@{ref_time}")
|
|
16592
|
+
|
|
16593
|
+
# Timezone and allow_tz_mismatch on same line
|
|
16594
|
+
tz_line_parts = []
|
|
16595
|
+
if value.get("timezone") is not None:
|
|
16596
|
+
# Convert timezone name to ISO 8601 offset format
|
|
16597
|
+
tz_name = value["timezone"]
|
|
16598
|
+
|
|
16599
|
+
try:
|
|
16600
|
+
tz_obj = ZoneInfo(tz_name)
|
|
16601
|
+
|
|
16602
|
+
# Get the current offset for this timezone
|
|
16603
|
+
now = datetime.datetime.now(tz_obj)
|
|
16604
|
+
offset = now.strftime("%z")
|
|
16605
|
+
|
|
16606
|
+
# Format as ISO 8601 extended: -07:00 (insert colon)
|
|
16607
|
+
if len(offset) == 5:
|
|
16608
|
+
tz_display = f"{offset[:3]}:{offset[3:]}"
|
|
16609
|
+
else:
|
|
16610
|
+
tz_display = offset
|
|
16611
|
+
|
|
16612
|
+
except Exception:
|
|
16613
|
+
tz_display = tz_name
|
|
16614
|
+
tz_line_parts.append(tz_display)
|
|
16615
|
+
|
|
16616
|
+
if value.get("allow_tz_mismatch"):
|
|
16617
|
+
tz_line_parts.append("~tz")
|
|
16618
|
+
|
|
16619
|
+
if tz_line_parts:
|
|
16620
|
+
extra_lines.append(" ".join(tz_line_parts))
|
|
16621
|
+
|
|
16622
|
+
if extra_lines:
|
|
16623
|
+
extra_html = "<br/>".join(extra_lines)
|
|
16624
|
+
values_upd.append(
|
|
16625
|
+
f'{max_age_str}<br/><span style="font-size: 9px;">{extra_html}</span>'
|
|
16626
|
+
)
|
|
16627
|
+
else:
|
|
16628
|
+
values_upd.append(max_age_str)
|
|
16629
|
+
|
|
16048
16630
|
elif assertion_type[i] in ["col_schema_match"]:
|
|
16049
16631
|
values_upd.append("SCHEMA")
|
|
16050
16632
|
|
|
@@ -16550,6 +17132,15 @@ class Validate:
|
|
|
16550
17132
|
if incl_footer_timings:
|
|
16551
17133
|
gt_tbl = gt_tbl.tab_source_note(source_note=html(table_time))
|
|
16552
17134
|
|
|
17135
|
+
# Add governance metadata as source note if any metadata is present
|
|
17136
|
+
governance_html = _create_governance_metadata_html(
|
|
17137
|
+
owner=self.owner,
|
|
17138
|
+
consumers=self.consumers,
|
|
17139
|
+
version=self.version,
|
|
17140
|
+
)
|
|
17141
|
+
if governance_html:
|
|
17142
|
+
gt_tbl = gt_tbl.tab_source_note(source_note=html(governance_html))
|
|
17143
|
+
|
|
16553
17144
|
# Create notes markdown from validation steps and add as separate source note if enabled
|
|
16554
17145
|
if incl_footer_notes:
|
|
16555
17146
|
notes_markdown = _create_notes_html(self.validation_info)
|
|
@@ -16898,6 +17489,18 @@ class Validate:
|
|
|
16898
17489
|
debug_return_df=debug_return_df,
|
|
16899
17490
|
)
|
|
16900
17491
|
|
|
17492
|
+
elif is_valid_agg(assertion_type):
|
|
17493
|
+
step_report = _step_report_aggregate(
|
|
17494
|
+
assertion_type=assertion_type,
|
|
17495
|
+
i=i,
|
|
17496
|
+
column=column,
|
|
17497
|
+
values=values,
|
|
17498
|
+
all_passed=all_passed,
|
|
17499
|
+
val_info=val_info,
|
|
17500
|
+
header=header,
|
|
17501
|
+
lang=lang,
|
|
17502
|
+
)
|
|
17503
|
+
|
|
16901
17504
|
else:
|
|
16902
17505
|
step_report = None # pragma: no cover
|
|
16903
17506
|
|
|
@@ -17494,19 +18097,278 @@ def _process_brief(
|
|
|
17494
18097
|
return brief
|
|
17495
18098
|
|
|
17496
18099
|
|
|
17497
|
-
def
|
|
17498
|
-
|
|
17499
|
-
|
|
17500
|
-
return "{auto}"
|
|
17501
|
-
else:
|
|
17502
|
-
return None
|
|
17503
|
-
else:
|
|
17504
|
-
return brief
|
|
18100
|
+
def _parse_max_age(max_age: str | datetime.timedelta) -> datetime.timedelta:
|
|
18101
|
+
"""
|
|
18102
|
+
Parse a max_age specification into a timedelta.
|
|
17505
18103
|
|
|
18104
|
+
Parameters
|
|
18105
|
+
----------
|
|
18106
|
+
max_age
|
|
18107
|
+
Either a timedelta object or a string like "24 hours", "1 day", "30 minutes",
|
|
18108
|
+
or compound expressions like "2 hours 15 minutes", "1 day 6 hours", etc.
|
|
17506
18109
|
|
|
17507
|
-
|
|
17508
|
-
|
|
17509
|
-
|
|
18110
|
+
Returns
|
|
18111
|
+
-------
|
|
18112
|
+
datetime.timedelta
|
|
18113
|
+
The parsed timedelta.
|
|
18114
|
+
|
|
18115
|
+
Raises
|
|
18116
|
+
------
|
|
18117
|
+
ValueError
|
|
18118
|
+
If the string format is invalid or the unit is not recognized.
|
|
18119
|
+
"""
|
|
18120
|
+
if isinstance(max_age, datetime.timedelta):
|
|
18121
|
+
return max_age
|
|
18122
|
+
|
|
18123
|
+
if not isinstance(max_age, str):
|
|
18124
|
+
raise TypeError(
|
|
18125
|
+
f"The `max_age` parameter must be a string or timedelta, got {type(max_age).__name__}."
|
|
18126
|
+
)
|
|
18127
|
+
|
|
18128
|
+
# Parse string format like "24 hours", "1 day", "30 minutes", etc.
|
|
18129
|
+
max_age_str = max_age.strip().lower()
|
|
18130
|
+
|
|
18131
|
+
# Define unit mappings (singular and plural forms)
|
|
18132
|
+
unit_mappings = {
|
|
18133
|
+
"second": "seconds",
|
|
18134
|
+
"seconds": "seconds",
|
|
18135
|
+
"sec": "seconds",
|
|
18136
|
+
"secs": "seconds",
|
|
18137
|
+
"s": "seconds",
|
|
18138
|
+
"minute": "minutes",
|
|
18139
|
+
"minutes": "minutes",
|
|
18140
|
+
"min": "minutes",
|
|
18141
|
+
"mins": "minutes",
|
|
18142
|
+
"m": "minutes",
|
|
18143
|
+
"hour": "hours",
|
|
18144
|
+
"hours": "hours",
|
|
18145
|
+
"hr": "hours",
|
|
18146
|
+
"hrs": "hours",
|
|
18147
|
+
"h": "hours",
|
|
18148
|
+
"day": "days",
|
|
18149
|
+
"days": "days",
|
|
18150
|
+
"d": "days",
|
|
18151
|
+
"week": "weeks",
|
|
18152
|
+
"weeks": "weeks",
|
|
18153
|
+
"wk": "weeks",
|
|
18154
|
+
"wks": "weeks",
|
|
18155
|
+
"w": "weeks",
|
|
18156
|
+
}
|
|
18157
|
+
|
|
18158
|
+
import re
|
|
18159
|
+
|
|
18160
|
+
# Pattern to find all number+unit pairs (supports compound expressions)
|
|
18161
|
+
# Matches: "2 hours 15 minutes", "1day6h", "30 min", etc.
|
|
18162
|
+
compound_pattern = r"(\d+(?:\.\d+)?)\s*([a-zA-Z]+)"
|
|
18163
|
+
matches = re.findall(compound_pattern, max_age_str)
|
|
18164
|
+
|
|
18165
|
+
if not matches:
|
|
18166
|
+
raise ValueError(
|
|
18167
|
+
f"Invalid max_age format: '{max_age}'. Expected format like '24 hours', "
|
|
18168
|
+
f"'1 day', '30 minutes', '2 hours 15 minutes', etc."
|
|
18169
|
+
)
|
|
18170
|
+
|
|
18171
|
+
# Accumulate timedelta from all matched components
|
|
18172
|
+
total_td = datetime.timedelta()
|
|
18173
|
+
valid_units = ["seconds", "minutes", "hours", "days", "weeks"]
|
|
18174
|
+
|
|
18175
|
+
for value_str, unit in matches:
|
|
18176
|
+
value = float(value_str)
|
|
18177
|
+
|
|
18178
|
+
# Normalize the unit
|
|
18179
|
+
unit_lower = unit.lower()
|
|
18180
|
+
if unit_lower not in unit_mappings:
|
|
18181
|
+
raise ValueError(
|
|
18182
|
+
f"Unknown time unit '{unit}' in max_age '{max_age}'. "
|
|
18183
|
+
f"Valid units are: {', '.join(valid_units)} (or their abbreviations)."
|
|
18184
|
+
)
|
|
18185
|
+
|
|
18186
|
+
normalized_unit = unit_mappings[unit_lower]
|
|
18187
|
+
|
|
18188
|
+
# Add to total timedelta
|
|
18189
|
+
if normalized_unit == "seconds":
|
|
18190
|
+
total_td += datetime.timedelta(seconds=value)
|
|
18191
|
+
elif normalized_unit == "minutes":
|
|
18192
|
+
total_td += datetime.timedelta(minutes=value)
|
|
18193
|
+
elif normalized_unit == "hours":
|
|
18194
|
+
total_td += datetime.timedelta(hours=value)
|
|
18195
|
+
elif normalized_unit == "days":
|
|
18196
|
+
total_td += datetime.timedelta(days=value)
|
|
18197
|
+
elif normalized_unit == "weeks":
|
|
18198
|
+
total_td += datetime.timedelta(weeks=value)
|
|
18199
|
+
|
|
18200
|
+
return total_td
|
|
18201
|
+
|
|
18202
|
+
|
|
18203
|
+
def _parse_timezone(timezone: str) -> datetime.tzinfo:
|
|
18204
|
+
"""
|
|
18205
|
+
Parse a timezone string into a tzinfo object.
|
|
18206
|
+
|
|
18207
|
+
Supports:
|
|
18208
|
+
- IANA timezone names: "America/New_York", "Europe/London", "UTC"
|
|
18209
|
+
- Offset strings: "-7", "+5", "-07:00", "+05:30"
|
|
18210
|
+
|
|
18211
|
+
Parameters
|
|
18212
|
+
----------
|
|
18213
|
+
timezone
|
|
18214
|
+
The timezone string to parse.
|
|
18215
|
+
|
|
18216
|
+
Returns
|
|
18217
|
+
-------
|
|
18218
|
+
datetime.tzinfo
|
|
18219
|
+
The parsed timezone object.
|
|
18220
|
+
|
|
18221
|
+
Raises
|
|
18222
|
+
------
|
|
18223
|
+
ValueError
|
|
18224
|
+
If the timezone is not valid.
|
|
18225
|
+
"""
|
|
18226
|
+
import re
|
|
18227
|
+
|
|
18228
|
+
# Check for offset formats: "-7", "+5", "-07:00", "+05:30", etc.
|
|
18229
|
+
# Match: optional sign, 1-2 digits, optional colon and 2 more digits
|
|
18230
|
+
offset_pattern = r"^([+-]?)(\d{1,2})(?::(\d{2}))?$"
|
|
18231
|
+
match = re.match(offset_pattern, timezone.strip())
|
|
18232
|
+
|
|
18233
|
+
if match:
|
|
18234
|
+
sign_str, hours_str, minutes_str = match.groups()
|
|
18235
|
+
hours = int(hours_str)
|
|
18236
|
+
minutes = int(minutes_str) if minutes_str else 0
|
|
18237
|
+
|
|
18238
|
+
# Apply sign (default positive if not specified)
|
|
18239
|
+
total_minutes = hours * 60 + minutes
|
|
18240
|
+
if sign_str == "-":
|
|
18241
|
+
total_minutes = -total_minutes
|
|
18242
|
+
|
|
18243
|
+
return datetime.timezone(datetime.timedelta(minutes=total_minutes))
|
|
18244
|
+
|
|
18245
|
+
# Try IANA timezone names (zoneinfo is standard in Python 3.9+)
|
|
18246
|
+
try:
|
|
18247
|
+
return ZoneInfo(timezone)
|
|
18248
|
+
except KeyError:
|
|
18249
|
+
pass
|
|
18250
|
+
|
|
18251
|
+
raise ValueError(
|
|
18252
|
+
f"Invalid timezone: '{timezone}'. Use an IANA timezone name "
|
|
18253
|
+
f"(e.g., 'America/New_York', 'UTC') or an offset (e.g., '-7', '+05:30')."
|
|
18254
|
+
)
|
|
18255
|
+
|
|
18256
|
+
|
|
18257
|
+
def _validate_timezone(timezone: str) -> None:
|
|
18258
|
+
"""
|
|
18259
|
+
Validate that a timezone string is valid.
|
|
18260
|
+
|
|
18261
|
+
Parameters
|
|
18262
|
+
----------
|
|
18263
|
+
timezone
|
|
18264
|
+
The timezone string to validate.
|
|
18265
|
+
|
|
18266
|
+
Raises
|
|
18267
|
+
------
|
|
18268
|
+
ValueError
|
|
18269
|
+
If the timezone is not valid.
|
|
18270
|
+
"""
|
|
18271
|
+
# Use _parse_timezone to validate - it will raise ValueError if invalid
|
|
18272
|
+
_parse_timezone(timezone)
|
|
18273
|
+
|
|
18274
|
+
|
|
18275
|
+
def _parse_reference_time(reference_time: str) -> datetime.datetime:
|
|
18276
|
+
"""
|
|
18277
|
+
Parse a reference time string into a datetime object.
|
|
18278
|
+
|
|
18279
|
+
Parameters
|
|
18280
|
+
----------
|
|
18281
|
+
reference_time
|
|
18282
|
+
An ISO 8601 formatted datetime string.
|
|
18283
|
+
|
|
18284
|
+
Returns
|
|
18285
|
+
-------
|
|
18286
|
+
datetime.datetime
|
|
18287
|
+
The parsed datetime object.
|
|
18288
|
+
|
|
18289
|
+
Raises
|
|
18290
|
+
------
|
|
18291
|
+
ValueError
|
|
18292
|
+
If the string cannot be parsed.
|
|
18293
|
+
"""
|
|
18294
|
+
# Try parsing with fromisoformat (handles most ISO 8601 formats)
|
|
18295
|
+
try:
|
|
18296
|
+
return datetime.datetime.fromisoformat(reference_time)
|
|
18297
|
+
except ValueError:
|
|
18298
|
+
pass
|
|
18299
|
+
|
|
18300
|
+
# Try parsing common formats
|
|
18301
|
+
formats = [
|
|
18302
|
+
"%Y-%m-%d %H:%M:%S",
|
|
18303
|
+
"%Y-%m-%d %H:%M:%S%z",
|
|
18304
|
+
"%Y-%m-%dT%H:%M:%S",
|
|
18305
|
+
"%Y-%m-%dT%H:%M:%S%z",
|
|
18306
|
+
"%Y-%m-%d",
|
|
18307
|
+
]
|
|
18308
|
+
|
|
18309
|
+
for fmt in formats:
|
|
18310
|
+
try:
|
|
18311
|
+
return datetime.datetime.strptime(reference_time, fmt)
|
|
18312
|
+
except ValueError:
|
|
18313
|
+
continue
|
|
18314
|
+
|
|
18315
|
+
raise ValueError(
|
|
18316
|
+
f"Could not parse reference_time '{reference_time}'. "
|
|
18317
|
+
f"Please use ISO 8601 format like '2024-01-15T10:30:00' or '2024-01-15T10:30:00+00:00'."
|
|
18318
|
+
)
|
|
18319
|
+
|
|
18320
|
+
|
|
18321
|
+
def _format_timedelta(td: datetime.timedelta) -> str:
|
|
18322
|
+
"""
|
|
18323
|
+
Format a timedelta into a human-readable string.
|
|
18324
|
+
|
|
18325
|
+
Parameters
|
|
18326
|
+
----------
|
|
18327
|
+
td
|
|
18328
|
+
The timedelta to format.
|
|
18329
|
+
|
|
18330
|
+
Returns
|
|
18331
|
+
-------
|
|
18332
|
+
str
|
|
18333
|
+
A human-readable string like "24 hours", "2 days 5 hours", etc.
|
|
18334
|
+
"""
|
|
18335
|
+
total_seconds = td.total_seconds()
|
|
18336
|
+
|
|
18337
|
+
if total_seconds < 60:
|
|
18338
|
+
val = round(total_seconds, 1)
|
|
18339
|
+
return f"{val}s"
|
|
18340
|
+
elif total_seconds < 3600:
|
|
18341
|
+
val = round(total_seconds / 60, 1)
|
|
18342
|
+
return f"{val}m"
|
|
18343
|
+
elif total_seconds < 86400:
|
|
18344
|
+
val = round(total_seconds / 3600, 1)
|
|
18345
|
+
return f"{val}h"
|
|
18346
|
+
elif total_seconds < 604800:
|
|
18347
|
+
# For days, show "xd yh" format for better readability
|
|
18348
|
+
days = int(total_seconds // 86400)
|
|
18349
|
+
remaining_hours = round((total_seconds % 86400) / 3600, 1)
|
|
18350
|
+
if remaining_hours == 0:
|
|
18351
|
+
return f"{days}d"
|
|
18352
|
+
else:
|
|
18353
|
+
return f"{days}d {remaining_hours}h"
|
|
18354
|
+
else:
|
|
18355
|
+
val = round(total_seconds / 604800)
|
|
18356
|
+
return f"{val}w"
|
|
18357
|
+
|
|
18358
|
+
|
|
18359
|
+
def _transform_auto_brief(brief: str | bool | None) -> str | None:
|
|
18360
|
+
if isinstance(brief, bool):
|
|
18361
|
+
if brief:
|
|
18362
|
+
return "{auto}"
|
|
18363
|
+
else:
|
|
18364
|
+
return None
|
|
18365
|
+
else:
|
|
18366
|
+
return brief
|
|
18367
|
+
|
|
18368
|
+
|
|
18369
|
+
def _process_action_str(
|
|
18370
|
+
action_str: str,
|
|
18371
|
+
step: int,
|
|
17510
18372
|
col: str | None,
|
|
17511
18373
|
value: Any,
|
|
17512
18374
|
type: str,
|
|
@@ -17688,6 +18550,14 @@ def _create_autobrief_or_failure_text(
|
|
|
17688
18550
|
for_failure=for_failure,
|
|
17689
18551
|
)
|
|
17690
18552
|
|
|
18553
|
+
if assertion_type == "data_freshness":
|
|
18554
|
+
return _create_text_data_freshness(
|
|
18555
|
+
lang=lang,
|
|
18556
|
+
column=column,
|
|
18557
|
+
value=values,
|
|
18558
|
+
for_failure=for_failure,
|
|
18559
|
+
)
|
|
18560
|
+
|
|
17691
18561
|
if assertion_type == "col_pct_null":
|
|
17692
18562
|
return _create_text_col_pct_null(
|
|
17693
18563
|
lang=lang,
|
|
@@ -17916,6 +18786,33 @@ def _create_text_col_count_match(lang: str, value: dict, for_failure: bool = Fal
|
|
|
17916
18786
|
return EXPECT_FAIL_TEXT[f"col_count_match_n_{type_}_text"][lang].format(values_text=values_text)
|
|
17917
18787
|
|
|
17918
18788
|
|
|
18789
|
+
def _create_text_data_freshness(
|
|
18790
|
+
lang: str,
|
|
18791
|
+
column: str | None,
|
|
18792
|
+
value: dict,
|
|
18793
|
+
for_failure: bool = False,
|
|
18794
|
+
) -> str:
|
|
18795
|
+
"""Create text for data_freshness validation."""
|
|
18796
|
+
type_ = _expect_failure_type(for_failure=for_failure)
|
|
18797
|
+
|
|
18798
|
+
column_text = _prep_column_text(column=column)
|
|
18799
|
+
max_age_text = _format_timedelta(value.get("max_age"))
|
|
18800
|
+
|
|
18801
|
+
if for_failure:
|
|
18802
|
+
age = value.get("age")
|
|
18803
|
+
age_text = _format_timedelta(age) if age else "unknown"
|
|
18804
|
+
return EXPECT_FAIL_TEXT[f"data_freshness_{type_}_text"][lang].format(
|
|
18805
|
+
column_text=column_text,
|
|
18806
|
+
max_age_text=max_age_text,
|
|
18807
|
+
age_text=age_text,
|
|
18808
|
+
)
|
|
18809
|
+
else:
|
|
18810
|
+
return EXPECT_FAIL_TEXT[f"data_freshness_{type_}_text"][lang].format(
|
|
18811
|
+
column_text=column_text,
|
|
18812
|
+
max_age_text=max_age_text,
|
|
18813
|
+
)
|
|
18814
|
+
|
|
18815
|
+
|
|
17919
18816
|
def _create_text_col_pct_null(
|
|
17920
18817
|
lang: str,
|
|
17921
18818
|
column: str | None,
|
|
@@ -18850,6 +19747,71 @@ def _extract_pre_argument(source: str) -> str:
|
|
|
18850
19747
|
return pre_arg
|
|
18851
19748
|
|
|
18852
19749
|
|
|
19750
|
+
def _create_governance_metadata_html(
|
|
19751
|
+
owner: str | None,
|
|
19752
|
+
consumers: list[str] | None,
|
|
19753
|
+
version: str | None,
|
|
19754
|
+
) -> str:
|
|
19755
|
+
"""
|
|
19756
|
+
Create HTML for governance metadata display in the report footer.
|
|
19757
|
+
|
|
19758
|
+
Parameters
|
|
19759
|
+
----------
|
|
19760
|
+
owner
|
|
19761
|
+
The owner of the data being validated.
|
|
19762
|
+
consumers
|
|
19763
|
+
List of consumers who depend on the data.
|
|
19764
|
+
version
|
|
19765
|
+
The version of the validation plan.
|
|
19766
|
+
|
|
19767
|
+
Returns
|
|
19768
|
+
-------
|
|
19769
|
+
str
|
|
19770
|
+
HTML string containing formatted governance metadata, or empty string if no metadata.
|
|
19771
|
+
"""
|
|
19772
|
+
if owner is None and consumers is None and version is None:
|
|
19773
|
+
return ""
|
|
19774
|
+
|
|
19775
|
+
metadata_parts = []
|
|
19776
|
+
|
|
19777
|
+
# Common style for the metadata badges (similar to timing style but slightly smaller font)
|
|
19778
|
+
badge_style = (
|
|
19779
|
+
"background-color: #FFF; color: #444; padding: 0.5em 0.5em; position: inherit; "
|
|
19780
|
+
"margin-right: 5px; border: solid 1px #999999; font-variant-numeric: tabular-nums; "
|
|
19781
|
+
"border-radius: 0; padding: 2px 10px 2px 10px; font-size: 11px;"
|
|
19782
|
+
)
|
|
19783
|
+
label_style = (
|
|
19784
|
+
"color: #777; font-weight: bold; font-size: 9px; text-transform: uppercase; "
|
|
19785
|
+
"margin-right: 3px;"
|
|
19786
|
+
)
|
|
19787
|
+
|
|
19788
|
+
if owner is not None:
|
|
19789
|
+
metadata_parts.append(
|
|
19790
|
+
f"<span style='{badge_style}'><span style='{label_style}'>Owner:</span> {owner}</span>"
|
|
19791
|
+
)
|
|
19792
|
+
|
|
19793
|
+
if consumers is not None and len(consumers) > 0:
|
|
19794
|
+
consumers_str = ", ".join(consumers)
|
|
19795
|
+
metadata_parts.append(
|
|
19796
|
+
f"<span style='{badge_style}'>"
|
|
19797
|
+
f"<span style='{label_style}'>Consumers:</span> {consumers_str}"
|
|
19798
|
+
f"</span>"
|
|
19799
|
+
)
|
|
19800
|
+
|
|
19801
|
+
if version is not None:
|
|
19802
|
+
metadata_parts.append(
|
|
19803
|
+
f"<span style='{badge_style}'>"
|
|
19804
|
+
f"<span style='{label_style}'>Version:</span> {version}"
|
|
19805
|
+
f"</span>"
|
|
19806
|
+
)
|
|
19807
|
+
|
|
19808
|
+
return (
|
|
19809
|
+
f"<div style='margin-top: 5px; margin-bottom: 5px; margin-left: 10px;'>"
|
|
19810
|
+
f"{''.join(metadata_parts)}"
|
|
19811
|
+
f"</div>"
|
|
19812
|
+
)
|
|
19813
|
+
|
|
19814
|
+
|
|
18853
19815
|
def _create_table_time_html(
|
|
18854
19816
|
time_start: datetime.datetime | None, time_end: datetime.datetime | None
|
|
18855
19817
|
) -> str:
|
|
@@ -20356,6 +21318,296 @@ def _step_report_rows_distinct(
|
|
|
20356
21318
|
return step_report
|
|
20357
21319
|
|
|
20358
21320
|
|
|
def _step_report_aggregate(
    assertion_type: str,
    i: int,
    column: str | list[str],
    values: dict,
    all_passed: bool,
    val_info: dict | None,
    header: str | None,
    lang: str,
) -> GT:
    """
    Generate a step report for aggregate validation methods (col_sum_*, col_avg_*, col_sd_*).

    This creates a 1-row table showing the computed aggregate value vs. the target value,
    along with tolerance and pass/fail status.

    Parameters
    ----------
    assertion_type
        The validation method name in the form `col_{agg}_{comp}` (e.g., `col_sum_eq`).
    i
        The 1-based step number, used in the report title.
    column
        The column (or single-element list of columns) the aggregate was computed over.
    values
        The step's stored values dict; used as a fallback source for `value`/`tol` when
        `val_info` is unavailable.
    all_passed
        Whether the aggregate assertion passed.
    val_info
        Dict with `actual`, `target`, `tol`, `lower_bound`, and `upper_bound` recorded
        during interrogation; may be `None`.
    header
        Header template for the report; `None` for no header, `":default:"` for the
        standard `{title}{details}` template, or custom Markdown text.
    lang
        Two-letter language code used to look up localized report text.

    Returns
    -------
    GT
        A Great Tables object representing the step report.

    Raises
    ------
    ImportError
        If neither Polars nor Pandas is installed.
    """

    # Determine whether the `lang` value represents a right-to-left language
    is_rtl_lang = lang in RTL_LANGUAGES
    direction_rtl = " direction: rtl;" if is_rtl_lang else ""

    # Parse assertion type to get aggregate function and comparison operator
    # Format: col_{agg}_{comp} (e.g., col_sum_eq, col_avg_gt, col_sd_le)
    parts = assertion_type.split("_")
    agg_type = parts[1]  # sum, avg, sd
    comp_type = parts[2]  # eq, gt, ge, lt, le

    # Map aggregate type to display name
    agg_display = {"sum": "SUM", "avg": "AVG", "sd": "SD"}.get(agg_type, agg_type.upper())

    # Map comparison type to symbol
    comp_symbols = {
        "eq": "=",
        "gt": ">",
        "ge": "≥",
        "lt": "<",
        "le": "≤",
    }
    comp_symbol = comp_symbols.get(comp_type, comp_type)

    # Get computed values from val_info (stored during interrogation)
    if val_info is not None:
        actual = val_info.get("actual", None)
        target = val_info.get("target", None)
        tol = val_info.get("tol", 0)
        lower_bound = val_info.get("lower_bound", target)
        upper_bound = val_info.get("upper_bound", target)
    else:
        # Fallback if val_info is not available
        actual = None
        target = values.get("value", None)
        tol = values.get("tol", 0)
        lower_bound = target
        upper_bound = target

    # Format column name for display (handle list vs string)
    if isinstance(column, list):
        column_display = column[0] if len(column) == 1 else ", ".join(column)
    else:
        column_display = str(column)

    # Generate assertion text for header
    if target is not None:
        target_display = f"{target:,.6g}" if isinstance(target, float) else f"{target:,}"
        assertion_text = f"{agg_display}({column_display}) {comp_symbol} {target_display}"
    else:
        assertion_text = f"{agg_display}({column_display}) {comp_symbol} ?"

    # Calculate difference from boundary
    if actual is not None and target is not None:
        if comp_type == "eq":
            # For equality, show distance from target (considering tolerance)
            if lower_bound == upper_bound:
                difference = actual - target
            else:
                # With tolerance, show distance from nearest bound
                if actual < lower_bound:
                    difference = actual - lower_bound
                elif actual > upper_bound:
                    difference = actual - upper_bound
                else:
                    difference = 0  # Within bounds
        elif comp_type in ["gt", "ge"]:
            # Distance from lower bound (positive if passing)
            difference = actual - lower_bound
        elif comp_type in ["lt", "le"]:
            # Distance from upper bound (negative if passing)
            difference = actual - upper_bound
        else:
            difference = actual - target
    else:
        difference = None

    # Format values for display
    def format_value(v):
        if v is None:
            return "—"
        if isinstance(v, float):
            return f"{v:,.6g}"
        return f"{v:,}"

    # Format tolerance for display: check the tuple form first so that an
    # all-zero tuple tolerance `(0, 0)` renders as the "no tolerance" dash
    # rather than "(-0, +0)"
    if isinstance(tol, tuple):
        tol_display = "—" if tol == (0, 0) else f"(-{tol[0]}, +{tol[1]})"
    elif tol == 0:
        tol_display = "—"
    else:
        tol_display = f"±{tol}"

    # Format difference with sign
    if difference is not None:
        if difference == 0:
            diff_display = "0"
        elif difference > 0:
            diff_display = (
                f"+{difference:,.6g}" if isinstance(difference, float) else f"+{difference:,}"
            )
        else:
            diff_display = (
                f"{difference:,.6g}" if isinstance(difference, float) else f"{difference:,}"
            )
    else:
        diff_display = "—"

    # Create pass/fail indicator
    if all_passed:
        status_html = CHECK_MARK_SPAN
        status_color = "#4CA64C"
    else:
        status_html = CROSS_MARK_SPAN
        status_color = "#CF142B"

    # Select DataFrame library (prefer Polars, fall back to Pandas)
    if _is_lib_present("polars"):
        import polars as pl

        df_lib = pl
    elif _is_lib_present("pandas"):  # pragma: no cover
        import pandas as pd  # pragma: no cover

        df_lib = pd  # pragma: no cover
    else:  # pragma: no cover
        raise ImportError(
            "Neither Polars nor Pandas is available for step report generation"
        )  # pragma: no cover

    # Create the data for the 1-row table
    report_data = df_lib.DataFrame(
        {
            "actual": [format_value(actual)],
            "target": [format_value(target)],
            "tolerance": [tol_display],
            "difference": [diff_display],
            "status": [status_html],
        }
    )

    # Create GT table with styling matching preview() and other step reports
    step_report = (
        GT(report_data, id="pb_step_tbl")
        .opt_table_font(font=google_font(name="IBM Plex Sans"))
        .opt_align_table_header(align="left")
        .cols_label(
            actual="ACTUAL",
            target="EXPECTED",
            tolerance="TOL",
            difference="DIFFERENCE",
            status="",
        )
        .cols_align(align="center")
        .fmt_markdown(columns=["actual", "target", "tolerance", "difference", "status"])
        .tab_style(
            style=style.text(color="black", font=google_font(name="IBM Plex Mono"), size="13px"),
            locations=loc.body(columns=["actual", "target", "tolerance", "difference"]),
        )
        .tab_style(
            style=style.text(size="13px"),
            locations=loc.body(columns="status"),
        )
        .tab_style(
            style=style.text(color="gray20", font=google_font(name="IBM Plex Mono"), size="12px"),
            locations=loc.column_labels(),
        )
        .tab_style(
            style=style.borders(
                sides=["top", "bottom"], color="#E9E9E9", style="solid", weight="1px"
            ),
            locations=loc.body(),
        )
        .tab_options(
            table_body_vlines_style="solid",
            table_body_vlines_width="1px",
            table_body_vlines_color="#E9E9E9",
            column_labels_vlines_style="solid",
            column_labels_vlines_width="1px",
            column_labels_vlines_color="#F2F2F2",
        )
        .cols_width(
            cases={
                "actual": "200px",
                "target": "200px",
                "tolerance": "150px",
                "difference": "200px",
                "status": "50px",
            }
        )
    )

    # Apply styling based on pass/fail
    if all_passed:
        step_report = step_report.tab_style(
            style=[
                style.text(color="#006400"),
                style.fill(color="#4CA64C33"),
            ],
            locations=loc.body(columns="status"),
        )
    else:
        step_report = step_report.tab_style(
            style=[
                style.text(color="#B22222"),
                style.fill(color="#FFC1C159"),
            ],
            locations=loc.body(columns="status"),
        )

    # If the version of `great_tables` is `>=0.17.0` then disable Quarto table processing.
    # Compare numeric version tuples: a plain string comparison is lexicographic and
    # misorders versions (e.g. "0.9.0" >= "0.17.0" is True as strings).
    raw_version_parts = version("great_tables").split(".")
    try:
        gt_version_tuple = tuple(int(p) for p in (raw_version_parts + ["0", "0"])[:3])
        quarto_disable = gt_version_tuple >= (0, 17, 0)
    except ValueError:  # pragma: no cover
        # Pre-release/dev version strings (e.g. "0.17.0rc1") are assumed recent enough
        quarto_disable = True
    if quarto_disable:
        step_report = step_report.tab_options(quarto_disable_processing=True)

    # If no header requested, return the table as-is
    if header is None:
        return step_report

    # Create header content
    assertion_header_text = STEP_REPORT_TEXT["assertion_header_text"][lang]

    # Wrap assertion text in styled code tag
    assertion_code = (
        f"<code style='color: #303030; font-family: monospace; font-size: smaller;'>"
        f"{assertion_text}</code>"
    )

    if all_passed:
        title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i) + " " + CHECK_MARK_SPAN
        result_stmt = STEP_REPORT_TEXT.get("agg_success_statement", {}).get(
            lang,
            f"The aggregate value for column <code>{column_display}</code> satisfies the condition.",
        )
        if isinstance(result_stmt, str) and "{column}" in result_stmt:
            result_stmt = result_stmt.format(column=column_display)
    else:
        title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i) + " " + CROSS_MARK_SPAN
        result_stmt = STEP_REPORT_TEXT.get("agg_failure_statement", {}).get(
            lang,
            f"The aggregate value for column <code>{column_display}</code> does not satisfy the condition.",
        )
        if isinstance(result_stmt, str) and "{column}" in result_stmt:
            result_stmt = result_stmt.format(column=column_display)

    details = (
        f"<div style='font-size: 13.6px; {direction_rtl}'>"
        "<div style='padding-top: 7px;'>"
        f"{assertion_header_text} <span style='border-style: solid; border-width: thin; "
        "border-color: lightblue; padding-left: 2px; padding-right: 2px;'>"
        "<code style='color: #303030; background-color: transparent; "
        f"position: relative; bottom: 1px;'>{assertion_code}</code></span>"
        "</div>"
        "<div style='padding-top: 7px;'>"
        f"{result_stmt}"
        "</div>"
        "</div>"
    )

    # Generate the default template text for the header when `":default:"` is used
    if header == ":default:":
        header = "{title}{details}"

    # Use commonmark to convert the header text to HTML
    header = commonmark.commonmark(header)

    # Place any templated text in the header
    header = header.format(title=title, details=details)

    # Create the header with `header` string
    step_report = step_report.tab_header(title=md(header))

    return step_report
|
20359
21611
|
def _step_report_schema_in_order(
|
|
20360
21612
|
step: int, schema_info: dict, header: str | None, lang: str, debug_return_df: bool = False
|
|
20361
21613
|
) -> GT | Any:
|