pointblank 0.18.0__py3-none-any.whl → 0.20.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +44 -1
- pointblank/_constants.py +258 -166
- pointblank/_constants_translations.py +378 -0
- pointblank/_interrogation.py +204 -0
- pointblank/_utils_llms_txt.py +20 -0
- pointblank/data/api-docs.txt +793 -1
- pointblank/field.py +1507 -0
- pointblank/generate/__init__.py +17 -0
- pointblank/generate/base.py +49 -0
- pointblank/generate/generators.py +573 -0
- pointblank/generate/regex.py +217 -0
- pointblank/locales/__init__.py +1476 -0
- pointblank/locales/data/AR/address.json +73 -0
- pointblank/locales/data/AR/company.json +60 -0
- pointblank/locales/data/AR/internet.json +19 -0
- pointblank/locales/data/AR/misc.json +7 -0
- pointblank/locales/data/AR/person.json +39 -0
- pointblank/locales/data/AR/text.json +38 -0
- pointblank/locales/data/AT/address.json +84 -0
- pointblank/locales/data/AT/company.json +65 -0
- pointblank/locales/data/AT/internet.json +20 -0
- pointblank/locales/data/AT/misc.json +8 -0
- pointblank/locales/data/AT/person.json +17 -0
- pointblank/locales/data/AT/text.json +35 -0
- pointblank/locales/data/AU/address.json +83 -0
- pointblank/locales/data/AU/company.json +65 -0
- pointblank/locales/data/AU/internet.json +20 -0
- pointblank/locales/data/AU/misc.json +8 -0
- pointblank/locales/data/AU/person.json +17 -0
- pointblank/locales/data/AU/text.json +35 -0
- pointblank/locales/data/BE/address.json +225 -0
- pointblank/locales/data/BE/company.json +129 -0
- pointblank/locales/data/BE/internet.json +36 -0
- pointblank/locales/data/BE/misc.json +6 -0
- pointblank/locales/data/BE/person.json +62 -0
- pointblank/locales/data/BE/text.json +38 -0
- pointblank/locales/data/BG/address.json +75 -0
- pointblank/locales/data/BG/company.json +60 -0
- pointblank/locales/data/BG/internet.json +19 -0
- pointblank/locales/data/BG/misc.json +7 -0
- pointblank/locales/data/BG/person.json +40 -0
- pointblank/locales/data/BG/text.json +38 -0
- pointblank/locales/data/BR/address.json +98 -0
- pointblank/locales/data/BR/company.json +65 -0
- pointblank/locales/data/BR/internet.json +20 -0
- pointblank/locales/data/BR/misc.json +8 -0
- pointblank/locales/data/BR/person.json +17 -0
- pointblank/locales/data/BR/text.json +35 -0
- pointblank/locales/data/CA/address.json +747 -0
- pointblank/locales/data/CA/company.json +120 -0
- pointblank/locales/data/CA/internet.json +24 -0
- pointblank/locales/data/CA/misc.json +11 -0
- pointblank/locales/data/CA/person.json +1033 -0
- pointblank/locales/data/CA/text.json +58 -0
- pointblank/locales/data/CH/address.json +184 -0
- pointblank/locales/data/CH/company.json +112 -0
- pointblank/locales/data/CH/internet.json +20 -0
- pointblank/locales/data/CH/misc.json +10 -0
- pointblank/locales/data/CH/person.json +64 -0
- pointblank/locales/data/CH/text.json +45 -0
- pointblank/locales/data/CL/address.json +71 -0
- pointblank/locales/data/CL/company.json +60 -0
- pointblank/locales/data/CL/internet.json +19 -0
- pointblank/locales/data/CL/misc.json +7 -0
- pointblank/locales/data/CL/person.json +38 -0
- pointblank/locales/data/CL/text.json +38 -0
- pointblank/locales/data/CN/address.json +124 -0
- pointblank/locales/data/CN/company.json +76 -0
- pointblank/locales/data/CN/internet.json +20 -0
- pointblank/locales/data/CN/misc.json +8 -0
- pointblank/locales/data/CN/person.json +50 -0
- pointblank/locales/data/CN/text.json +38 -0
- pointblank/locales/data/CO/address.json +76 -0
- pointblank/locales/data/CO/company.json +60 -0
- pointblank/locales/data/CO/internet.json +19 -0
- pointblank/locales/data/CO/misc.json +7 -0
- pointblank/locales/data/CO/person.json +38 -0
- pointblank/locales/data/CO/text.json +38 -0
- pointblank/locales/data/CY/address.json +62 -0
- pointblank/locales/data/CY/company.json +60 -0
- pointblank/locales/data/CY/internet.json +19 -0
- pointblank/locales/data/CY/misc.json +7 -0
- pointblank/locales/data/CY/person.json +38 -0
- pointblank/locales/data/CY/text.json +38 -0
- pointblank/locales/data/CZ/address.json +70 -0
- pointblank/locales/data/CZ/company.json +61 -0
- pointblank/locales/data/CZ/internet.json +19 -0
- pointblank/locales/data/CZ/misc.json +7 -0
- pointblank/locales/data/CZ/person.json +40 -0
- pointblank/locales/data/CZ/text.json +38 -0
- pointblank/locales/data/DE/address.json +756 -0
- pointblank/locales/data/DE/company.json +101 -0
- pointblank/locales/data/DE/internet.json +22 -0
- pointblank/locales/data/DE/misc.json +11 -0
- pointblank/locales/data/DE/person.json +1026 -0
- pointblank/locales/data/DE/text.json +50 -0
- pointblank/locales/data/DK/address.json +231 -0
- pointblank/locales/data/DK/company.json +65 -0
- pointblank/locales/data/DK/internet.json +20 -0
- pointblank/locales/data/DK/misc.json +7 -0
- pointblank/locales/data/DK/person.json +45 -0
- pointblank/locales/data/DK/text.json +43 -0
- pointblank/locales/data/EE/address.json +69 -0
- pointblank/locales/data/EE/company.json +60 -0
- pointblank/locales/data/EE/internet.json +19 -0
- pointblank/locales/data/EE/misc.json +7 -0
- pointblank/locales/data/EE/person.json +39 -0
- pointblank/locales/data/EE/text.json +38 -0
- pointblank/locales/data/ES/address.json +3086 -0
- pointblank/locales/data/ES/company.json +644 -0
- pointblank/locales/data/ES/internet.json +25 -0
- pointblank/locales/data/ES/misc.json +11 -0
- pointblank/locales/data/ES/person.json +488 -0
- pointblank/locales/data/ES/text.json +49 -0
- pointblank/locales/data/FI/address.json +93 -0
- pointblank/locales/data/FI/company.json +65 -0
- pointblank/locales/data/FI/internet.json +20 -0
- pointblank/locales/data/FI/misc.json +8 -0
- pointblank/locales/data/FI/person.json +17 -0
- pointblank/locales/data/FI/text.json +35 -0
- pointblank/locales/data/FR/address.json +619 -0
- pointblank/locales/data/FR/company.json +111 -0
- pointblank/locales/data/FR/internet.json +22 -0
- pointblank/locales/data/FR/misc.json +11 -0
- pointblank/locales/data/FR/person.json +1066 -0
- pointblank/locales/data/FR/text.json +50 -0
- pointblank/locales/data/GB/address.json +5759 -0
- pointblank/locales/data/GB/company.json +131 -0
- pointblank/locales/data/GB/internet.json +24 -0
- pointblank/locales/data/GB/misc.json +45 -0
- pointblank/locales/data/GB/person.json +578 -0
- pointblank/locales/data/GB/text.json +61 -0
- pointblank/locales/data/GR/address.json +68 -0
- pointblank/locales/data/GR/company.json +61 -0
- pointblank/locales/data/GR/internet.json +19 -0
- pointblank/locales/data/GR/misc.json +7 -0
- pointblank/locales/data/GR/person.json +39 -0
- pointblank/locales/data/GR/text.json +38 -0
- pointblank/locales/data/HK/address.json +79 -0
- pointblank/locales/data/HK/company.json +69 -0
- pointblank/locales/data/HK/internet.json +19 -0
- pointblank/locales/data/HK/misc.json +7 -0
- pointblank/locales/data/HK/person.json +42 -0
- pointblank/locales/data/HK/text.json +38 -0
- pointblank/locales/data/HR/address.json +73 -0
- pointblank/locales/data/HR/company.json +60 -0
- pointblank/locales/data/HR/internet.json +19 -0
- pointblank/locales/data/HR/misc.json +7 -0
- pointblank/locales/data/HR/person.json +38 -0
- pointblank/locales/data/HR/text.json +38 -0
- pointblank/locales/data/HU/address.json +70 -0
- pointblank/locales/data/HU/company.json +61 -0
- pointblank/locales/data/HU/internet.json +19 -0
- pointblank/locales/data/HU/misc.json +7 -0
- pointblank/locales/data/HU/person.json +40 -0
- pointblank/locales/data/HU/text.json +38 -0
- pointblank/locales/data/ID/address.json +68 -0
- pointblank/locales/data/ID/company.json +61 -0
- pointblank/locales/data/ID/internet.json +19 -0
- pointblank/locales/data/ID/misc.json +7 -0
- pointblank/locales/data/ID/person.json +40 -0
- pointblank/locales/data/ID/text.json +38 -0
- pointblank/locales/data/IE/address.json +643 -0
- pointblank/locales/data/IE/company.json +140 -0
- pointblank/locales/data/IE/internet.json +24 -0
- pointblank/locales/data/IE/misc.json +44 -0
- pointblank/locales/data/IE/person.json +55 -0
- pointblank/locales/data/IE/text.json +60 -0
- pointblank/locales/data/IN/address.json +92 -0
- pointblank/locales/data/IN/company.json +65 -0
- pointblank/locales/data/IN/internet.json +20 -0
- pointblank/locales/data/IN/misc.json +8 -0
- pointblank/locales/data/IN/person.json +52 -0
- pointblank/locales/data/IN/text.json +39 -0
- pointblank/locales/data/IS/address.json +63 -0
- pointblank/locales/data/IS/company.json +61 -0
- pointblank/locales/data/IS/internet.json +19 -0
- pointblank/locales/data/IS/misc.json +7 -0
- pointblank/locales/data/IS/person.json +44 -0
- pointblank/locales/data/IS/text.json +38 -0
- pointblank/locales/data/IT/address.json +192 -0
- pointblank/locales/data/IT/company.json +137 -0
- pointblank/locales/data/IT/internet.json +20 -0
- pointblank/locales/data/IT/misc.json +10 -0
- pointblank/locales/data/IT/person.json +70 -0
- pointblank/locales/data/IT/text.json +44 -0
- pointblank/locales/data/JP/address.json +713 -0
- pointblank/locales/data/JP/company.json +113 -0
- pointblank/locales/data/JP/internet.json +22 -0
- pointblank/locales/data/JP/misc.json +10 -0
- pointblank/locales/data/JP/person.json +1057 -0
- pointblank/locales/data/JP/text.json +51 -0
- pointblank/locales/data/KR/address.json +77 -0
- pointblank/locales/data/KR/company.json +68 -0
- pointblank/locales/data/KR/internet.json +19 -0
- pointblank/locales/data/KR/misc.json +7 -0
- pointblank/locales/data/KR/person.json +40 -0
- pointblank/locales/data/KR/text.json +38 -0
- pointblank/locales/data/LT/address.json +66 -0
- pointblank/locales/data/LT/company.json +60 -0
- pointblank/locales/data/LT/internet.json +19 -0
- pointblank/locales/data/LT/misc.json +7 -0
- pointblank/locales/data/LT/person.json +42 -0
- pointblank/locales/data/LT/text.json +38 -0
- pointblank/locales/data/LU/address.json +66 -0
- pointblank/locales/data/LU/company.json +60 -0
- pointblank/locales/data/LU/internet.json +19 -0
- pointblank/locales/data/LU/misc.json +7 -0
- pointblank/locales/data/LU/person.json +38 -0
- pointblank/locales/data/LU/text.json +38 -0
- pointblank/locales/data/LV/address.json +62 -0
- pointblank/locales/data/LV/company.json +60 -0
- pointblank/locales/data/LV/internet.json +19 -0
- pointblank/locales/data/LV/misc.json +7 -0
- pointblank/locales/data/LV/person.json +40 -0
- pointblank/locales/data/LV/text.json +38 -0
- pointblank/locales/data/MT/address.json +61 -0
- pointblank/locales/data/MT/company.json +60 -0
- pointblank/locales/data/MT/internet.json +19 -0
- pointblank/locales/data/MT/misc.json +7 -0
- pointblank/locales/data/MT/person.json +38 -0
- pointblank/locales/data/MT/text.json +38 -0
- pointblank/locales/data/MX/address.json +100 -0
- pointblank/locales/data/MX/company.json +65 -0
- pointblank/locales/data/MX/internet.json +20 -0
- pointblank/locales/data/MX/misc.json +8 -0
- pointblank/locales/data/MX/person.json +18 -0
- pointblank/locales/data/MX/text.json +39 -0
- pointblank/locales/data/NL/address.json +1517 -0
- pointblank/locales/data/NL/company.json +133 -0
- pointblank/locales/data/NL/internet.json +44 -0
- pointblank/locales/data/NL/misc.json +55 -0
- pointblank/locales/data/NL/person.json +365 -0
- pointblank/locales/data/NL/text.json +210 -0
- pointblank/locales/data/NO/address.json +86 -0
- pointblank/locales/data/NO/company.json +66 -0
- pointblank/locales/data/NO/internet.json +20 -0
- pointblank/locales/data/NO/misc.json +8 -0
- pointblank/locales/data/NO/person.json +17 -0
- pointblank/locales/data/NO/text.json +35 -0
- pointblank/locales/data/NZ/address.json +90 -0
- pointblank/locales/data/NZ/company.json +65 -0
- pointblank/locales/data/NZ/internet.json +20 -0
- pointblank/locales/data/NZ/misc.json +8 -0
- pointblank/locales/data/NZ/person.json +17 -0
- pointblank/locales/data/NZ/text.json +39 -0
- pointblank/locales/data/PH/address.json +67 -0
- pointblank/locales/data/PH/company.json +61 -0
- pointblank/locales/data/PH/internet.json +19 -0
- pointblank/locales/data/PH/misc.json +7 -0
- pointblank/locales/data/PH/person.json +40 -0
- pointblank/locales/data/PH/text.json +38 -0
- pointblank/locales/data/PL/address.json +91 -0
- pointblank/locales/data/PL/company.json +65 -0
- pointblank/locales/data/PL/internet.json +20 -0
- pointblank/locales/data/PL/misc.json +8 -0
- pointblank/locales/data/PL/person.json +17 -0
- pointblank/locales/data/PL/text.json +35 -0
- pointblank/locales/data/PT/address.json +90 -0
- pointblank/locales/data/PT/company.json +65 -0
- pointblank/locales/data/PT/internet.json +20 -0
- pointblank/locales/data/PT/misc.json +8 -0
- pointblank/locales/data/PT/person.json +17 -0
- pointblank/locales/data/PT/text.json +35 -0
- pointblank/locales/data/RO/address.json +73 -0
- pointblank/locales/data/RO/company.json +61 -0
- pointblank/locales/data/RO/internet.json +19 -0
- pointblank/locales/data/RO/misc.json +7 -0
- pointblank/locales/data/RO/person.json +40 -0
- pointblank/locales/data/RO/text.json +38 -0
- pointblank/locales/data/RU/address.json +74 -0
- pointblank/locales/data/RU/company.json +60 -0
- pointblank/locales/data/RU/internet.json +19 -0
- pointblank/locales/data/RU/misc.json +7 -0
- pointblank/locales/data/RU/person.json +38 -0
- pointblank/locales/data/RU/text.json +38 -0
- pointblank/locales/data/SE/address.json +247 -0
- pointblank/locales/data/SE/company.json +65 -0
- pointblank/locales/data/SE/internet.json +20 -0
- pointblank/locales/data/SE/misc.json +7 -0
- pointblank/locales/data/SE/person.json +45 -0
- pointblank/locales/data/SE/text.json +43 -0
- pointblank/locales/data/SI/address.json +67 -0
- pointblank/locales/data/SI/company.json +60 -0
- pointblank/locales/data/SI/internet.json +19 -0
- pointblank/locales/data/SI/misc.json +7 -0
- pointblank/locales/data/SI/person.json +38 -0
- pointblank/locales/data/SI/text.json +38 -0
- pointblank/locales/data/SK/address.json +64 -0
- pointblank/locales/data/SK/company.json +60 -0
- pointblank/locales/data/SK/internet.json +19 -0
- pointblank/locales/data/SK/misc.json +7 -0
- pointblank/locales/data/SK/person.json +38 -0
- pointblank/locales/data/SK/text.json +38 -0
- pointblank/locales/data/TR/address.json +105 -0
- pointblank/locales/data/TR/company.json +65 -0
- pointblank/locales/data/TR/internet.json +20 -0
- pointblank/locales/data/TR/misc.json +8 -0
- pointblank/locales/data/TR/person.json +17 -0
- pointblank/locales/data/TR/text.json +35 -0
- pointblank/locales/data/TW/address.json +86 -0
- pointblank/locales/data/TW/company.json +69 -0
- pointblank/locales/data/TW/internet.json +19 -0
- pointblank/locales/data/TW/misc.json +7 -0
- pointblank/locales/data/TW/person.json +42 -0
- pointblank/locales/data/TW/text.json +38 -0
- pointblank/locales/data/US/address.json +996 -0
- pointblank/locales/data/US/company.json +131 -0
- pointblank/locales/data/US/internet.json +22 -0
- pointblank/locales/data/US/misc.json +11 -0
- pointblank/locales/data/US/person.json +1092 -0
- pointblank/locales/data/US/text.json +56 -0
- pointblank/locales/data/_shared/misc.json +42 -0
- pointblank/schema.py +339 -2
- pointblank/validate.py +1263 -11
- {pointblank-0.18.0.dist-info → pointblank-0.20.0.dist-info}/METADATA +45 -1
- pointblank-0.20.0.dist-info/RECORD +366 -0
- {pointblank-0.18.0.dist-info → pointblank-0.20.0.dist-info}/WHEEL +1 -1
- pointblank-0.18.0.dist-info/RECORD +0 -59
- {pointblank-0.18.0.dist-info → pointblank-0.20.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.18.0.dist-info → pointblank-0.20.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.18.0.dist-info → pointblank-0.20.0.dist-info}/top_level.txt +0 -0
pointblank/data/api-docs.txt
CHANGED
|
@@ -11,7 +11,7 @@ failure thresholds (using the `Thresholds` class or through shorthands for this
|
|
|
11
11
|
`Validate` class has numerous methods for defining validation steps and for obtaining
|
|
12
12
|
post-interrogation metrics and data.
|
|
13
13
|
|
|
14
|
-
Validate(data: 'IntoDataFrame', reference: 'IntoFrame | None' = None, tbl_name: 'str | None' = None, label: 'str | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, final_actions: 'FinalActions | None' = None, brief: 'str | bool | None' = None, lang: 'str | None' = None, locale: 'str | None' = None) -> None
|
|
14
|
+
Validate(data: 'IntoDataFrame', reference: 'IntoFrame | None' = None, tbl_name: 'str | None' = None, label: 'str | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, final_actions: 'FinalActions | None' = None, brief: 'str | bool | None' = None, lang: 'str | None' = None, locale: 'str | None' = None, owner: 'str | None' = None, consumers: 'str | list[str] | None' = None, version: 'str | None' = None) -> None
|
|
15
15
|
|
|
16
16
|
Workflow for defining a set of validations on a table and interrogating for results.
|
|
17
17
|
|
|
@@ -99,6 +99,18 @@ Validate(data: 'IntoDataFrame', reference: 'IntoFrame | None' = None, tbl_name:
|
|
|
99
99
|
locale's rules. Examples include `"en-US"` for English (United States) and `"fr-FR"` for
|
|
100
100
|
French (France). More simply, this can be a language identifier without a designation of
|
|
101
101
|
territory, like `"es"` for Spanish.
|
|
102
|
+
owner
|
|
103
|
+
An optional string identifying the owner of the data being validated. This is useful for
|
|
104
|
+
governance purposes, indicating who is responsible for the quality and maintenance of the
|
|
105
|
+
data. For example, `"data-platform-team"` or `"analytics-engineering"`.
|
|
106
|
+
consumers
|
|
107
|
+
An optional string or list of strings identifying who depends on or consumes this data.
|
|
108
|
+
This helps document data dependencies and can be useful for impact analysis when data
|
|
109
|
+
quality issues are detected. For example, `"ml-team"` or `["ml-team", "analytics"]`.
|
|
110
|
+
version
|
|
111
|
+
An optional string representing the version of the validation plan or data contract. This
|
|
112
|
+
supports semantic versioning (e.g., `"1.0.0"`, `"2.1.0"`) and is useful for tracking changes
|
|
113
|
+
to validation rules over time and for organizational governance.
|
|
102
114
|
|
|
103
115
|
Returns
|
|
104
116
|
-------
|
|
@@ -8289,6 +8301,271 @@ col_pct_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnS
|
|
|
8289
8301
|
calculates to 2.7 to 3.9, which rounds down to 2 to 3 rows).
|
|
8290
8302
|
|
|
8291
8303
|
|
|
8304
|
+
data_freshness(self, column: 'str', max_age: 'str | datetime.timedelta', reference_time: 'datetime.datetime | str | None' = None, timezone: 'str | None' = None, allow_tz_mismatch: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
8305
|
+
|
|
8306
|
+
Validate that data in a datetime column is not older than a specified maximum age.
|
|
8307
|
+
|
|
8308
|
+
The `data_freshness()` validation method checks whether the most recent timestamp in the
|
|
8309
|
+
specified datetime column is within the allowed `max_age=` from the `reference_time=` (which
|
|
8310
|
+
defaults to the current time). This is useful for ensuring data pipelines are delivering
|
|
8311
|
+
fresh data and for enforcing data SLAs.
|
|
8312
|
+
|
|
8313
|
+
This method helps detect stale data by comparing the maximum (most recent) value in a
|
|
8314
|
+
datetime column against an expected freshness threshold.
|
|
8315
|
+
|
|
8316
|
+
Parameters
|
|
8317
|
+
----------
|
|
8318
|
+
column
|
|
8319
|
+
The name of the datetime column to check for freshness. This column should contain
|
|
8320
|
+
date or datetime values.
|
|
8321
|
+
max_age
|
|
8322
|
+
The maximum allowed age of the data. Can be specified as: (1) a string with a
|
|
8323
|
+
human-readable duration like `"24 hours"`, `"1 day"`, `"30 minutes"`, `"2 weeks"`, etc.
|
|
8324
|
+
(supported units: `seconds`, `minutes`, `hours`, `days`, `weeks`), or (2) a
|
|
8325
|
+
`datetime.timedelta` object for precise control.
|
|
8326
|
+
reference_time
|
|
8327
|
+
The reference point in time to compare against. Defaults to `None`, which uses the
|
|
8328
|
+
current time (UTC if `timezone=` is not specified). Can be: (1) a `datetime.datetime`
|
|
8329
|
+
object (timezone-aware recommended), (2) a string in ISO 8601 format (e.g.,
|
|
8330
|
+
`"2024-01-15T10:30:00"` or `"2024-01-15T10:30:00+05:30"`), or (3) `None` to use the
|
|
8331
|
+
current time.
|
|
8332
|
+
timezone
|
|
8333
|
+
The timezone to use for interpreting the data and reference time. Accepts IANA
|
|
8334
|
+
timezone names (e.g., `"America/New_York"`), hour offsets (e.g., `"-7"`), or ISO 8601
|
|
8335
|
+
offsets (e.g., `"-07:00"`). When `None` (default), naive datetimes are treated as UTC.
|
|
8336
|
+
See the *The `timezone=` Parameter* section for details.
|
|
8337
|
+
allow_tz_mismatch
|
|
8338
|
+
Whether to allow timezone mismatches between the column data and reference time.
|
|
8339
|
+
By default (`False`), a warning note is added when comparing timezone-naive with
|
|
8340
|
+
timezone-aware datetimes. Set to `True` to suppress these warnings.
|
|
8341
|
+
pre
|
|
8342
|
+
An optional preprocessing function or lambda to apply to the data table during
|
|
8343
|
+
interrogation. This function should take a table as input and return a modified table.
|
|
8344
|
+
thresholds
|
|
8345
|
+
Set threshold failure levels for reporting and reacting to exceedances of the levels.
|
|
8346
|
+
The thresholds are set at the step level and will override any global thresholds set in
|
|
8347
|
+
`Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
|
|
8348
|
+
be set locally and global thresholds (if any) will take effect.
|
|
8349
|
+
actions
|
|
8350
|
+
Optional actions to take when the validation step meets or exceeds any set threshold
|
|
8351
|
+
levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
|
|
8352
|
+
define the actions.
|
|
8353
|
+
brief
|
|
8354
|
+
An optional brief description of the validation step that will be displayed in the
|
|
8355
|
+
reporting table. You can use the templating elements like `"{step}"` to insert
|
|
8356
|
+
the step number, or `"{auto}"` to include an automatically generated brief. If `True`
|
|
8357
|
+
the entire brief will be automatically generated. If `None` (the default) then there
|
|
8358
|
+
won't be a brief.
|
|
8359
|
+
active
|
|
8360
|
+
A boolean value indicating whether the validation step should be active. Using `False`
|
|
8361
|
+
will make the validation step inactive (still reporting its presence and keeping indexes
|
|
8362
|
+
for the steps unchanged).
|
|
8363
|
+
|
|
8364
|
+
Returns
|
|
8365
|
+
-------
|
|
8366
|
+
Validate
|
|
8367
|
+
The `Validate` object with the added validation step.
|
|
8368
|
+
|
|
8369
|
+
How Timezones Affect Freshness Checks
|
|
8370
|
+
-------------------------------------
|
|
8371
|
+
Freshness validation involves comparing two times: the **data time** (the most recent
|
|
8372
|
+
timestamp in your column) and the **execution time** (when and where the validation runs).
|
|
8373
|
+
Timezone confusion typically arises because these two times may originate from different
|
|
8374
|
+
contexts.
|
|
8375
|
+
|
|
8376
|
+
Consider these common scenarios:
|
|
8377
|
+
|
|
8378
|
+
- your data timestamps are stored in UTC (common for databases), but you're running
|
|
8379
|
+
validation on your laptop in New York (Eastern Time)
|
|
8380
|
+
- you develop and test validation locally, then deploy it to a cloud workflow that runs
|
|
8381
|
+
in UTC—suddenly your 'same' validation behaves differently
|
|
8382
|
+
- your data comes from servers in multiple regions, each recording timestamps in their
|
|
8383
|
+
local timezone
|
|
8384
|
+
|
|
8385
|
+
The `timezone=` parameter exists to solve this problem by establishing a single, explicit
|
|
8386
|
+
timezone context for the freshness comparison. When you specify a timezone, Pointblank
|
|
8387
|
+
interprets both the data timestamps (if naive) and the execution time in that timezone,
|
|
8388
|
+
ensuring consistent behavior whether you run validation on your laptop or in a cloud
|
|
8389
|
+
workflow.
|
|
8390
|
+
|
|
8391
|
+
**Scenario 1: Data has timezone-aware datetimes**
|
|
8392
|
+
|
|
8393
|
+
```python
|
|
8394
|
+
# Your data column has values like: 2024-01-15 10:30:00+00:00 (UTC)
|
|
8395
|
+
# Comparison is straightforward as both sides have explicit timezones
|
|
8396
|
+
.data_freshness(column="updated_at", max_age="24 hours")
|
|
8397
|
+
```
|
|
8398
|
+
|
|
8399
|
+
**Scenario 2: Data has naive datetimes (no timezone)**
|
|
8400
|
+
|
|
8401
|
+
```python
|
|
8402
|
+
# Your data column has values like: 2024-01-15 10:30:00 (no timezone)
|
|
8403
|
+
# Specify the timezone the data was recorded in:
|
|
8404
|
+
.data_freshness(column="updated_at", max_age="24 hours", timezone="America/New_York")
|
|
8405
|
+
```
|
|
8406
|
+
|
|
8407
|
+
**Scenario 3: Ensuring consistent behavior across environments**
|
|
8408
|
+
|
|
8409
|
+
```python
|
|
8410
|
+
# Pin the timezone to ensure identical results whether running locally or in the cloud
|
|
8411
|
+
.data_freshness(
|
|
8412
|
+
column="updated_at",
|
|
8413
|
+
max_age="24 hours",
|
|
8414
|
+
timezone="UTC", # Explicit timezone removes environment dependence
|
|
8415
|
+
)
|
|
8416
|
+
```
|
|
8417
|
+
|
|
8418
|
+
The `timezone=` Parameter
|
|
8419
|
+
---------------------------
|
|
8420
|
+
The `timezone=` parameter accepts several convenient formats, making it easy to specify
|
|
8421
|
+
timezones in whatever way is most natural for your use case. The following examples
|
|
8422
|
+
illustrate the three supported input styles.
|
|
8423
|
+
|
|
8424
|
+
**IANA Timezone Names** (recommended for regions with daylight saving time):
|
|
8425
|
+
|
|
8426
|
+
```python
|
|
8427
|
+
timezone="America/New_York" # Eastern Time (handles DST automatically)
|
|
8428
|
+
timezone="Europe/London" # UK time
|
|
8429
|
+
timezone="Asia/Tokyo" # Japan Standard Time
|
|
8430
|
+
timezone="Australia/Sydney" # Australian Eastern Time
|
|
8431
|
+
timezone="UTC" # Coordinated Universal Time
|
|
8432
|
+
```
|
|
8433
|
+
|
|
8434
|
+
**Simple Hour Offsets** (quick and easy):
|
|
8435
|
+
|
|
8436
|
+
**ISO 8601 Offset Format** (precise, including fractional hours):
|
|
8437
|
+
|
|
8438
|
+
When a timezone is specified:
|
|
8439
|
+
|
|
8440
|
+
- naive datetime values in the column are assumed to be in this timezone.
|
|
8441
|
+
- the reference time (if naive) is assumed to be in this timezone.
|
|
8442
|
+
- the validation report will show times in this timezone.
|
|
8443
|
+
|
|
8444
|
+
When `None` (default):
|
|
8445
|
+
|
|
8446
|
+
- if your column has timezone-aware datetimes, those timezones are used
|
|
8447
|
+
- if your column has naive datetimes, they're treated as UTC
|
|
8448
|
+
- the current time reference uses UTC
|
|
8449
|
+
|
|
8450
|
+
Note that IANA timezone names are preferred when daylight saving time transitions matter, as
|
|
8451
|
+
they automatically handle the offset changes. Fixed offsets like `"-7"` or `"-07:00"` do not
|
|
8452
|
+
account for DST.
|
|
8453
|
+
|
|
8454
|
+
Recommendations for Working with Timestamps
|
|
8455
|
+
-------------------------------------------
|
|
8456
|
+
When working with datetime data, storing timestamps in UTC in your databases is strongly
|
|
8457
|
+
recommended since it provides a consistent reference point regardless of where your data
|
|
8458
|
+
originates or where it's consumed. Using timezone-aware datetimes whenever possible helps
|
|
8459
|
+
avoid ambiguity—when a datetime has an explicit timezone, there's no guessing about what
|
|
8460
|
+
time it actually represents.
|
|
8461
|
+
|
|
8462
|
+
If you're working with naive datetimes (which lack timezone information), always specify the
|
|
8463
|
+
`timezone=` parameter so Pointblank knows how to interpret those values. When providing
|
|
8464
|
+
`reference_time=` as a string, use ISO 8601 format with the timezone offset included (e.g.,
|
|
8465
|
+
`"2024-01-15T10:30:00+00:00"`) to ensure unambiguous parsing. Finally, prefer IANA timezone
|
|
8466
|
+
names (like `"America/New_York"`) over fixed offsets (like `"-05:00"`) when daylight saving
|
|
8467
|
+
time transitions matter, since IANA names automatically handle the twice-yearly offset
|
|
8468
|
+
changes. To see all available IANA timezone names in Python, use
|
|
8469
|
+
`zoneinfo.available_timezones()` from the standard library's `zoneinfo` module.
|
|
8470
|
+
|
|
8471
|
+
Examples
|
|
8472
|
+
--------
|
|
8473
|
+
The simplest use of `data_freshness()` requires just two arguments: the `column=` containing
|
|
8474
|
+
your timestamps and `max_age=` specifying how old the data can be. In this first example,
|
|
8475
|
+
we create sample data with an `"updated_at"` column containing timestamps from 1, 12, and
|
|
8476
|
+
20 hours ago. By setting `max_age="24 hours"`, we're asserting that the most recent
|
|
8477
|
+
timestamp should be within 24 hours of the current time. Since the newest record is only
|
|
8478
|
+
1 hour old, this validation passes.
|
|
8479
|
+
|
|
8480
|
+
```python
|
|
8481
|
+
import pointblank as pb
|
|
8482
|
+
import polars as pl
|
|
8483
|
+
from datetime import datetime, timedelta
|
|
8484
|
+
|
|
8485
|
+
# Create sample data with recent timestamps
|
|
8486
|
+
recent_data = pl.DataFrame({
|
|
8487
|
+
"id": [1, 2, 3],
|
|
8488
|
+
"updated_at": [
|
|
8489
|
+
datetime.now() - timedelta(hours=1),
|
|
8490
|
+
datetime.now() - timedelta(hours=12),
|
|
8491
|
+
datetime.now() - timedelta(hours=20),
|
|
8492
|
+
]
|
|
8493
|
+
})
|
|
8494
|
+
|
|
8495
|
+
validation = (
|
|
8496
|
+
pb.Validate(data=recent_data)
|
|
8497
|
+
.data_freshness(column="updated_at", max_age="24 hours")
|
|
8498
|
+
.interrogate()
|
|
8499
|
+
)
|
|
8500
|
+
|
|
8501
|
+
validation
|
|
8502
|
+
```
|
|
8503
|
+
|
|
8504
|
+
The `max_age=` parameter accepts human-readable strings with various time units. You can
|
|
8505
|
+
chain multiple `data_freshness()` calls to check different freshness thresholds
|
|
8506
|
+
simultaneously—useful for tiered SLAs where you might want warnings at 30 minutes but
|
|
8507
|
+
errors at 2 days.
|
|
8508
|
+
|
|
8509
|
+
```python
|
|
8510
|
+
# Check data is fresh within different time windows
|
|
8511
|
+
validation = (
|
|
8512
|
+
pb.Validate(data=recent_data)
|
|
8513
|
+
.data_freshness(column="updated_at", max_age="30 minutes") # Very fresh
|
|
8514
|
+
.data_freshness(column="updated_at", max_age="2 days") # Reasonably fresh
|
|
8515
|
+
.data_freshness(column="updated_at", max_age="1 week") # Within a week
|
|
8516
|
+
.interrogate()
|
|
8517
|
+
)
|
|
8518
|
+
|
|
8519
|
+
validation
|
|
8520
|
+
```
|
|
8521
|
+
|
|
8522
|
+
When your data contains naive datetimes (timestamps without timezone information), use the
|
|
8523
|
+
`timezone=` parameter to specify what timezone those values represent. Here we have event
|
|
8524
|
+
data recorded in Eastern Time, so we set `timezone="America/New_York"` to ensure the
|
|
8525
|
+
freshness comparison is done correctly.
|
|
8526
|
+
|
|
8527
|
+
```python
|
|
8528
|
+
# Data with naive datetimes (assume they're in Eastern Time)
|
|
8529
|
+
eastern_data = pl.DataFrame({
|
|
8530
|
+
"event_time": [
|
|
8531
|
+
datetime.now() - timedelta(hours=2),
|
|
8532
|
+
datetime.now() - timedelta(hours=5),
|
|
8533
|
+
]
|
|
8534
|
+
})
|
|
8535
|
+
|
|
8536
|
+
validation = (
|
|
8537
|
+
pb.Validate(data=eastern_data)
|
|
8538
|
+
.data_freshness(
|
|
8539
|
+
column="event_time",
|
|
8540
|
+
max_age="12 hours",
|
|
8541
|
+
timezone="America/New_York" # Interpret times as Eastern
|
|
8542
|
+
)
|
|
8543
|
+
.interrogate()
|
|
8544
|
+
)
|
|
8545
|
+
|
|
8546
|
+
validation
|
|
8547
|
+
```
|
|
8548
|
+
|
|
8549
|
+
For reproducible validations or historical checks, you can use `reference_time=` to compare
|
|
8550
|
+
against a specific point in time instead of the current time. This is particularly useful
|
|
8551
|
+
for testing or when validating data snapshots. The reference time should include a timezone
|
|
8552
|
+
offset (like `+00:00` for UTC) to avoid ambiguity.
|
|
8553
|
+
|
|
8554
|
+
```python
|
|
8555
|
+
validation = (
|
|
8556
|
+
pb.Validate(data=recent_data)
|
|
8557
|
+
.data_freshness(
|
|
8558
|
+
column="updated_at",
|
|
8559
|
+
max_age="24 hours",
|
|
8560
|
+
reference_time="2024-01-15T12:00:00+00:00"
|
|
8561
|
+
)
|
|
8562
|
+
.interrogate()
|
|
8563
|
+
)
|
|
8564
|
+
|
|
8565
|
+
validation
|
|
8566
|
+
```
|
|
8567
|
+
|
|
8568
|
+
|
|
8292
8569
|
col_schema_match(self, schema: 'Schema', complete: 'bool' = True, in_order: 'bool' = True, case_sensitive_colnames: 'bool' = True, case_sensitive_dtypes: 'bool' = True, full_match_dtypes: 'bool' = True, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
|
|
8293
8570
|
|
|
8294
8571
|
Do columns in the table (and their types) match a predefined schema?
|
|
@@ -15241,6 +15518,521 @@ config(report_incl_header: 'bool' = True, report_incl_footer: 'bool' = True, rep
|
|
|
15241
15518
|
|
|
15242
15519
|
|
|
15243
15520
|
|
|
15521
|
+
## The Test Data Generation family
|
|
15522
|
+
|
|
15523
|
+
Generate synthetic test data based on schema definitions. Use
|
|
15524
|
+
`generate_dataset()` to create data from a `Schema` object. The helper functions define typed fields
|
|
15525
|
+
with constraints for realistic test data generation.
|
|
15526
|
+
|
|
15527
|
+
generate_dataset(schema: 'Schema', n: 'int' = 100, seed: 'int | None' = None, output: "Literal['polars', 'pandas', 'dict']" = 'polars', country: 'str' = 'US') -> 'Any'
|
|
15528
|
+
|
|
15529
|
+
Generate synthetic test data from a schema.
|
|
15530
|
+
|
|
15531
|
+
This function generates random data that conforms to a schema's column definitions. When the
|
|
15532
|
+
schema is defined using `Field` objects with constraints (e.g., `min_val`, `max_val`,
|
|
15533
|
+
`pattern`, `preset`), the generated data will respect those constraints.
|
|
15534
|
+
|
|
15535
|
+
This is a convenience function that wraps `Schema.generate()` for a more functional style
|
|
15536
|
+
of use, similar to how `load_dataset()` loads built-in datasets.
|
|
15537
|
+
|
|
15538
|
+
Parameters
|
|
15539
|
+
----------
|
|
15540
|
+
schema
|
|
15541
|
+
The schema object defining the structure and constraints of the data to generate.
|
|
15542
|
+
n
|
|
15543
|
+
Number of rows to generate. Default is `100`.
|
|
15544
|
+
seed
|
|
15545
|
+
Random seed for reproducibility. If provided, the same seed will produce
|
|
15546
|
+
the same data. Default is `None` (non-deterministic).
|
|
15547
|
+
output
|
|
15548
|
+
Output format for the generated data. Options are: (1) `"polars"` (default) returns a
|
|
15549
|
+
Polars DataFrame, (2) `"pandas"` returns a Pandas DataFrame, and (3) `"dict"` returns
|
|
15550
|
+
a dictionary of lists.
|
|
15551
|
+
country
|
|
15552
|
+
Country code for realistic data generation when using presets (e.g., `preset="email"`,
|
|
15553
|
+
`preset="address"`). Accepts ISO 3166-1 alpha-2 codes (e.g., `"US"`, `"DE"`, `"FR"`)
|
|
15554
|
+
or alpha-3 codes (e.g., `"USA"`, `"DEU"`, `"FRA"`). Default is `"US"`.
|
|
15555
|
+
|
|
15556
|
+
Returns
|
|
15557
|
+
-------
|
|
15558
|
+
DataFrame or dict
|
|
15559
|
+
Generated data in the requested format.
|
|
15560
|
+
|
|
15561
|
+
Raises
|
|
15562
|
+
------
|
|
15563
|
+
ValueError
|
|
15564
|
+
If the schema has no columns or if constraints cannot be satisfied.
|
|
15565
|
+
ImportError
|
|
15566
|
+
If required optional dependencies are not installed.
|
|
15567
|
+
|
|
15568
|
+
Supported Countries
|
|
15569
|
+
-------------------
|
|
15570
|
+
The `country=` parameter controls the country used for generating realistic data with
|
|
15571
|
+
presets (e.g., `preset="email"`, `preset="address"`). This affects location-specific
|
|
15572
|
+
formats like addresses, phone numbers, and postal codes. Currently, **50 countries** are
|
|
15573
|
+
supported with full locale data:
|
|
15574
|
+
|
|
15575
|
+
**Europe (32 countries):** Austria (`"AT"`), Belgium (`"BE"`), Bulgaria (`"BG"`),
|
|
15576
|
+
Croatia (`"HR"`), Cyprus (`"CY"`), Czech Republic (`"CZ"`), Denmark (`"DK"`),
|
|
15577
|
+
Estonia (`"EE"`), Finland (`"FI"`), France (`"FR"`), Germany (`"DE"`), Greece (`"GR"`),
|
|
15578
|
+
Hungary (`"HU"`), Iceland (`"IS"`), Ireland (`"IE"`), Italy (`"IT"`), Latvia (`"LV"`),
|
|
15579
|
+
Lithuania (`"LT"`), Luxembourg (`"LU"`), Malta (`"MT"`), Netherlands (`"NL"`),
|
|
15580
|
+
Norway (`"NO"`), Poland (`"PL"`), Portugal (`"PT"`), Romania (`"RO"`), Russia (`"RU"`),
|
|
15581
|
+
Slovakia (`"SK"`), Slovenia (`"SI"`), Spain (`"ES"`), Sweden (`"SE"`),
|
|
15582
|
+
Switzerland (`"CH"`), United Kingdom (`"GB"`)
|
|
15583
|
+
|
|
15584
|
+
**Americas (7 countries):** Argentina (`"AR"`), Brazil (`"BR"`), Canada (`"CA"`),
|
|
15585
|
+
Chile (`"CL"`), Colombia (`"CO"`), Mexico (`"MX"`), United States (`"US"`)
|
|
15586
|
+
|
|
15587
|
+
**Asia-Pacific (10 countries):** Australia (`"AU"`), China (`"CN"`), Hong Kong (`"HK"`),
|
|
15588
|
+
India (`"IN"`), Indonesia (`"ID"`), Japan (`"JP"`), New Zealand (`"NZ"`),
|
|
15589
|
+
Philippines (`"PH"`), South Korea (`"KR"`), Taiwan (`"TW"`)
|
|
15590
|
+
|
|
15591
|
+
**Middle East (1 country):** Turkey (`"TR"`)
|
|
15592
|
+
|
|
15593
|
+
Examples
|
|
15594
|
+
--------
|
|
15595
|
+
Generate test data from a schema with field constraints:
|
|
15596
|
+
|
|
15597
|
+
```python
|
|
15598
|
+
import pointblank as pb
|
|
15599
|
+
|
|
15600
|
+
schema = pb.Schema(
|
|
15601
|
+
user_id=pb.int_field(min_val=1, unique=True),
|
|
15602
|
+
email=pb.string_field(preset="email"),
|
|
15603
|
+
age=pb.int_field(min_val=18, max_val=100),
|
|
15604
|
+
status=pb.string_field(allowed=["active", "pending", "inactive"]),
|
|
15605
|
+
)
|
|
15606
|
+
|
|
15607
|
+
# Generate 100 rows of test data
|
|
15608
|
+
pb.preview(pb.generate_dataset(schema, n=100, seed=23))
|
|
15609
|
+
```
|
|
15610
|
+
|
|
15611
|
+
Generate data from a simple dtype-only schema as a Pandas DataFrame:
|
|
15612
|
+
|
|
15613
|
+
```python
|
|
15614
|
+
schema = pb.Schema(name="String", age="Int64", active="Boolean")
|
|
15615
|
+
pb.preview(pb.generate_dataset(schema, n=50, seed=23, output="pandas"))
|
|
15616
|
+
```
|
|
15617
|
+
|
|
15618
|
+
Generate data with German addresses by using `country="DE"`:
|
|
15619
|
+
|
|
15620
|
+
```python
|
|
15621
|
+
schema = pb.Schema(
|
|
15622
|
+
name=pb.string_field(preset="name"),
|
|
15623
|
+
address=pb.string_field(preset="address"),
|
|
15624
|
+
city=pb.string_field(preset="city"),
|
|
15625
|
+
)
|
|
15626
|
+
pb.preview(pb.generate_dataset(schema, n=20, seed=23, country="DE"))
|
|
15627
|
+
```
|
|
15628
|
+
|
|
15629
|
+
|
|
15630
|
+
int_field(min_val: 'int | None' = None, max_val: 'int | None' = None, allowed: 'list[int] | None' = None, nullable: 'bool' = False, null_probability: 'float' = 0.0, unique: 'bool' = False, generator: 'Callable[[], Any] | None' = None, dtype: 'str' = 'Int64') -> 'IntField'
|
|
15631
|
+
|
|
15632
|
+
Create an integer column specification.
|
|
15633
|
+
|
|
15634
|
+
Parameters
|
|
15635
|
+
----------
|
|
15636
|
+
min_val
|
|
15637
|
+
Minimum value (inclusive). Default is `None` (no minimum).
|
|
15638
|
+
max_val
|
|
15639
|
+
Maximum value (inclusive). Default is `None` (no maximum).
|
|
15640
|
+
allowed
|
|
15641
|
+
List of allowed values (categorical constraint). When provided,
|
|
15642
|
+
values are sampled from this list.
|
|
15643
|
+
nullable
|
|
15644
|
+
Whether the column can contain null values. Default is `False`.
|
|
15645
|
+
null_probability
|
|
15646
|
+
Probability of generating null when `nullable=True`. Default is `0.0`.
|
|
15647
|
+
unique
|
|
15648
|
+
Whether all values must be unique. Default is `False`.
|
|
15649
|
+
generator
|
|
15650
|
+
Custom callable that generates values. Overrides other settings.
|
|
15651
|
+
dtype
|
|
15652
|
+
Integer dtype. Default is `"Int64"`. Options: `"Int8"`, `"Int16"`,
|
|
15653
|
+
`"Int32"`, `"Int64"`, `"UInt8"`, `"UInt16"`, `"UInt32"`, `"UInt64"`.
|
|
15654
|
+
|
|
15655
|
+
Returns
|
|
15656
|
+
-------
|
|
15657
|
+
IntField
|
|
15658
|
+
An integer field specification.
|
|
15659
|
+
|
|
15660
|
+
Examples
|
|
15661
|
+
--------
|
|
15662
|
+
Define a schema with integer fields and generate test data:
|
|
15663
|
+
|
|
15664
|
+
```python
|
|
15665
|
+
import pointblank as pb
|
|
15666
|
+
|
|
15667
|
+
# Define a schema with integer field specifications
|
|
15668
|
+
schema = pb.Schema(
|
|
15669
|
+
user_id=pb.int_field(min_val=1, unique=True),
|
|
15670
|
+
age=pb.int_field(min_val=0, max_val=120),
|
|
15671
|
+
rating=pb.int_field(allowed=[1, 2, 3, 4, 5]),
|
|
15672
|
+
)
|
|
15673
|
+
|
|
15674
|
+
# Generate 100 rows of test data
|
|
15675
|
+
pb.preview(pb.generate_dataset(schema, n=100, seed=23))
|
|
15676
|
+
```
|
|
15677
|
+
|
|
15678
|
+
The generated data will have unique user IDs of at least `1`, ages between `0` and `120`,
|
|
15679
|
+
and ratings sampled from the allowed values.
|
|
15680
|
+
|
|
15681
|
+
|
|
15682
|
+
float_field(min_val: 'float | None' = None, max_val: 'float | None' = None, allowed: 'list[float] | None' = None, nullable: 'bool' = False, null_probability: 'float' = 0.0, unique: 'bool' = False, generator: 'Callable[[], Any] | None' = None, dtype: 'str' = 'Float64') -> 'FloatField'
|
|
15683
|
+
|
|
15684
|
+
Create a floating-point column specification.
|
|
15685
|
+
|
|
15686
|
+
Parameters
|
|
15687
|
+
----------
|
|
15688
|
+
min_val
|
|
15689
|
+
Minimum value (inclusive). Default is `None` (no minimum).
|
|
15690
|
+
max_val
|
|
15691
|
+
Maximum value (inclusive). Default is `None` (no maximum).
|
|
15692
|
+
allowed
|
|
15693
|
+
List of allowed values (categorical constraint). When provided,
|
|
15694
|
+
values are sampled from this list.
|
|
15695
|
+
nullable
|
|
15696
|
+
Whether the column can contain null values. Default is `False`.
|
|
15697
|
+
null_probability
|
|
15698
|
+
Probability of generating null when `nullable=True`. Default is `0.0`.
|
|
15699
|
+
unique
|
|
15700
|
+
Whether all values must be unique. Default is `False`.
|
|
15701
|
+
generator
|
|
15702
|
+
Custom callable that generates values. Overrides other settings.
|
|
15703
|
+
dtype
|
|
15704
|
+
Float dtype. Default is `"Float64"`. Options: `"Float32"`, `"Float64"`.
|
|
15705
|
+
|
|
15706
|
+
Returns
|
|
15707
|
+
-------
|
|
15708
|
+
FloatField
|
|
15709
|
+
A float field specification.
|
|
15710
|
+
|
|
15711
|
+
Examples
|
|
15712
|
+
--------
|
|
15713
|
+
Define a schema with float fields and generate test data:
|
|
15714
|
+
|
|
15715
|
+
```python
|
|
15716
|
+
import pointblank as pb
|
|
15717
|
+
|
|
15718
|
+
# Define a schema with float field specifications
|
|
15719
|
+
schema = pb.Schema(
|
|
15720
|
+
price=pb.float_field(min_val=0.01, max_val=9999.99),
|
|
15721
|
+
probability=pb.float_field(min_val=0.0, max_val=1.0),
|
|
15722
|
+
temperature=pb.float_field(min_val=-40.0, max_val=50.0),
|
|
15723
|
+
)
|
|
15724
|
+
|
|
15725
|
+
# Generate 100 rows of test data
|
|
15726
|
+
pb.preview(pb.generate_dataset(schema, n=100, seed=23))
|
|
15727
|
+
```
|
|
15728
|
+
|
|
15729
|
+
Values are uniformly distributed across the specified ranges.
|
|
15730
|
+
|
|
15731
|
+
|
|
15732
|
+
string_field(min_length: 'int | None' = None, max_length: 'int | None' = None, pattern: 'str | None' = None, preset: 'str | None' = None, allowed: 'list[str] | None' = None, nullable: 'bool' = False, null_probability: 'float' = 0.0, unique: 'bool' = False, generator: 'Callable[[], Any] | None' = None) -> 'StringField'
|
|
15733
|
+
|
|
15734
|
+
Create a string column specification.
|
|
15735
|
+
|
|
15736
|
+
Parameters
|
|
15737
|
+
----------
|
|
15738
|
+
min_length
|
|
15739
|
+
Minimum string length. Default is `None` (no minimum).
|
|
15740
|
+
max_length
|
|
15741
|
+
Maximum string length. Default is `None` (no maximum).
|
|
15742
|
+
pattern
|
|
15743
|
+
Regular expression pattern for generated strings.
|
|
15744
|
+
preset
|
|
15745
|
+
Preset for realistic data (e.g., `"email"`, `"name"`, `"phone_number"`).
|
|
15746
|
+
allowed
|
|
15747
|
+
List of allowed values (categorical constraint).
|
|
15748
|
+
nullable
|
|
15749
|
+
Whether the column can contain null values. Default is `False`.
|
|
15750
|
+
null_probability
|
|
15751
|
+
Probability of generating null when `nullable=True`. Default is `0.0`.
|
|
15752
|
+
unique
|
|
15753
|
+
Whether all values must be unique. Default is `False`.
|
|
15754
|
+
generator
|
|
15755
|
+
Custom callable that generates values. Overrides other settings.
|
|
15756
|
+
|
|
15757
|
+
Returns
|
|
15758
|
+
-------
|
|
15759
|
+
StringField
|
|
15760
|
+
A string field specification.
|
|
15761
|
+
|
|
15762
|
+
Examples
|
|
15763
|
+
--------
|
|
15764
|
+
Define a schema with string fields and generate test data:
|
|
15765
|
+
|
|
15766
|
+
```python
|
|
15767
|
+
import pointblank as pb
|
|
15768
|
+
|
|
15769
|
+
# Define a schema with string field specifications
|
|
15770
|
+
schema = pb.Schema(
|
|
15771
|
+
name=pb.string_field(preset="name"),
|
|
15772
|
+
email=pb.string_field(preset="email", unique=True),
|
|
15773
|
+
status=pb.string_field(allowed=["active", "pending", "inactive"]),
|
|
15774
|
+
code=pb.string_field(pattern=r"[A-Z]{3}-[0-9]{4}"),
|
|
15775
|
+
)
|
|
15776
|
+
|
|
15777
|
+
# Generate 100 rows of test data
|
|
15778
|
+
pb.preview(pb.generate_dataset(schema, n=100, seed=23))
|
|
15779
|
+
```
|
|
15780
|
+
|
|
15781
|
+
The generated data will have coherent names and emails (derived from the name),
|
|
15782
|
+
statuses sampled from the allowed values, and codes matching the regex pattern.
|
|
15783
|
+
|
|
15784
|
+
|
|
15785
|
+
bool_field(p_true: 'float' = 0.5, nullable: 'bool' = False, null_probability: 'float' = 0.0, unique: 'bool' = False, generator: 'Callable[[], Any] | None' = None) -> 'BoolField'
|
|
15786
|
+
|
|
15787
|
+
Create a boolean column specification.
|
|
15788
|
+
|
|
15789
|
+
Parameters
|
|
15790
|
+
----------
|
|
15791
|
+
p_true
|
|
15792
|
+
Probability of generating `True`. Default is `0.5` (equal probability).
|
|
15793
|
+
Must be between `0.0` and `1.0`.
|
|
15794
|
+
nullable
|
|
15795
|
+
Whether the column can contain null values. Default is `False`.
|
|
15796
|
+
null_probability
|
|
15797
|
+
Probability of generating null when `nullable=True`. Default is `0.0`.
|
|
15798
|
+
unique
|
|
15799
|
+
Whether all values must be unique. Default is `False`.
|
|
15800
|
+
Note: Boolean can only have 2 unique non-null values.
|
|
15801
|
+
generator
|
|
15802
|
+
Custom callable that generates values. Overrides other settings.
|
|
15803
|
+
|
|
15804
|
+
Returns
|
|
15805
|
+
-------
|
|
15806
|
+
BoolField
|
|
15807
|
+
A boolean field specification.
|
|
15808
|
+
|
|
15809
|
+
Examples
|
|
15810
|
+
--------
|
|
15811
|
+
Define a schema with boolean fields and generate test data:
|
|
15812
|
+
|
|
15813
|
+
```python
|
|
15814
|
+
import pointblank as pb
|
|
15815
|
+
|
|
15816
|
+
# Define a schema with boolean field specifications
|
|
15817
|
+
schema = pb.Schema(
|
|
15818
|
+
is_active=pb.bool_field(p_true=0.8), # 80% True
|
|
15819
|
+
is_premium=pb.bool_field(p_true=0.2), # 20% True
|
|
15820
|
+
is_verified=pb.bool_field(), # 50% True (default)
|
|
15821
|
+
)
|
|
15822
|
+
|
|
15823
|
+
# Generate 100 rows of test data
|
|
15824
|
+
pb.preview(pb.generate_dataset(schema, n=100, seed=23))
|
|
15825
|
+
```
|
|
15826
|
+
|
|
15827
|
+
The `p_true=` parameter controls the probability of generating `True` values,
|
|
15828
|
+
which is helpful for simulating real-world distributions.
|
|
15829
|
+
|
|
15830
|
+
|
|
15831
|
+
date_field(min_date: 'str | date | None' = None, max_date: 'str | date | None' = None, nullable: 'bool' = False, null_probability: 'float' = 0.0, unique: 'bool' = False, generator: 'Callable[[], Any] | None' = None) -> 'DateField'
|
|
15832
|
+
|
|
15833
|
+
Create a date column specification.
|
|
15834
|
+
|
|
15835
|
+
Parameters
|
|
15836
|
+
----------
|
|
15837
|
+
min_date
|
|
15838
|
+
Minimum date (inclusive). Can be ISO string or `date` object.
|
|
15839
|
+
max_date
|
|
15840
|
+
Maximum date (inclusive). Can be ISO string or `date` object.
|
|
15841
|
+
nullable
|
|
15842
|
+
Whether the column can contain null values. Default is `False`.
|
|
15843
|
+
null_probability
|
|
15844
|
+
Probability of generating null when `nullable=True`. Default is `0.0`.
|
|
15845
|
+
unique
|
|
15846
|
+
Whether all values must be unique. Default is `False`.
|
|
15847
|
+
generator
|
|
15848
|
+
Custom callable that generates values. Overrides other settings.
|
|
15849
|
+
|
|
15850
|
+
Returns
|
|
15851
|
+
-------
|
|
15852
|
+
DateField
|
|
15853
|
+
A date field specification.
|
|
15854
|
+
|
|
15855
|
+
Examples
|
|
15856
|
+
--------
|
|
15857
|
+
Define a schema with date fields and generate test data:
|
|
15858
|
+
|
|
15859
|
+
```python
|
|
15860
|
+
import pointblank as pb
|
|
15861
|
+
from datetime import date
|
|
15862
|
+
|
|
15863
|
+
# Define a schema with date field specifications
|
|
15864
|
+
schema = pb.Schema(
|
|
15865
|
+
birth_date=pb.date_field(
|
|
15866
|
+
min_date=date(1960, 1, 1),
|
|
15867
|
+
max_date=date(2005, 12, 31)
|
|
15868
|
+
),
|
|
15869
|
+
hire_date=pb.date_field(
|
|
15870
|
+
min_date=date(2020, 1, 1),
|
|
15871
|
+
max_date=date(2024, 12, 31)
|
|
15872
|
+
),
|
|
15873
|
+
)
|
|
15874
|
+
|
|
15875
|
+
# Generate 100 rows of test data
|
|
15876
|
+
pb.preview(pb.generate_dataset(schema, n=100, seed=23))
|
|
15877
|
+
```
|
|
15878
|
+
|
|
15879
|
+
Date values are uniformly distributed within the specified range.
|
|
15880
|
+
|
|
15881
|
+
|
|
15882
|
+
datetime_field(min_date: 'str | datetime | None' = None, max_date: 'str | datetime | None' = None, nullable: 'bool' = False, null_probability: 'float' = 0.0, unique: 'bool' = False, generator: 'Callable[[], Any] | None' = None) -> 'DatetimeField'
|
|
15883
|
+
|
|
15884
|
+
Create a datetime column specification.
|
|
15885
|
+
|
|
15886
|
+
Parameters
|
|
15887
|
+
----------
|
|
15888
|
+
min_date
|
|
15889
|
+
Minimum datetime (inclusive). Can be ISO string or `datetime` object.
|
|
15890
|
+
max_date
|
|
15891
|
+
Maximum datetime (inclusive). Can be ISO string or `datetime` object.
|
|
15892
|
+
nullable
|
|
15893
|
+
Whether the column can contain null values. Default is `False`.
|
|
15894
|
+
null_probability
|
|
15895
|
+
Probability of generating null when `nullable=True`. Default is `0.0`.
|
|
15896
|
+
unique
|
|
15897
|
+
Whether all values must be unique. Default is `False`.
|
|
15898
|
+
generator
|
|
15899
|
+
Custom callable that generates values. Overrides other settings.
|
|
15900
|
+
|
|
15901
|
+
Returns
|
|
15902
|
+
-------
|
|
15903
|
+
DatetimeField
|
|
15904
|
+
A datetime field specification.
|
|
15905
|
+
|
|
15906
|
+
Examples
|
|
15907
|
+
--------
|
|
15908
|
+
Define a schema with datetime fields and generate test data:
|
|
15909
|
+
|
|
15910
|
+
```python
|
|
15911
|
+
import pointblank as pb
|
|
15912
|
+
from datetime import datetime
|
|
15913
|
+
|
|
15914
|
+
# Define a schema with datetime field specifications
|
|
15915
|
+
schema = pb.Schema(
|
|
15916
|
+
created_at=pb.datetime_field(
|
|
15917
|
+
min_date=datetime(2024, 1, 1),
|
|
15918
|
+
max_date=datetime(2024, 12, 31)
|
|
15919
|
+
),
|
|
15920
|
+
updated_at=pb.datetime_field(
|
|
15921
|
+
min_date=datetime(2024, 6, 1),
|
|
15922
|
+
max_date=datetime(2024, 12, 31)
|
|
15923
|
+
),
|
|
15924
|
+
)
|
|
15925
|
+
|
|
15926
|
+
# Generate 100 rows of test data
|
|
15927
|
+
pb.preview(pb.generate_dataset(schema, n=100, seed=23))
|
|
15928
|
+
```
|
|
15929
|
+
|
|
15930
|
+
Datetime values are uniformly distributed within the specified range.
|
|
15931
|
+
|
|
15932
|
+
|
|
15933
|
+
time_field(min_time: 'str | time | None' = None, max_time: 'str | time | None' = None, nullable: 'bool' = False, null_probability: 'float' = 0.0, unique: 'bool' = False, generator: 'Callable[[], Any] | None' = None) -> 'TimeField'
|
|
15934
|
+
|
|
15935
|
+
Create a time column specification.
|
|
15936
|
+
|
|
15937
|
+
Parameters
|
|
15938
|
+
----------
|
|
15939
|
+
min_time
|
|
15940
|
+
Minimum time (inclusive). Can be ISO string or `time` object.
|
|
15941
|
+
max_time
|
|
15942
|
+
Maximum time (inclusive). Can be ISO string or `time` object.
|
|
15943
|
+
nullable
|
|
15944
|
+
Whether the column can contain null values. Default is `False`.
|
|
15945
|
+
null_probability
|
|
15946
|
+
Probability of generating null when `nullable=True`. Default is `0.0`.
|
|
15947
|
+
unique
|
|
15948
|
+
Whether all values must be unique. Default is `False`.
|
|
15949
|
+
generator
|
|
15950
|
+
Custom callable that generates values. Overrides other settings.
|
|
15951
|
+
|
|
15952
|
+
Returns
|
|
15953
|
+
-------
|
|
15954
|
+
TimeField
|
|
15955
|
+
A time field specification.
|
|
15956
|
+
|
|
15957
|
+
Examples
|
|
15958
|
+
--------
|
|
15959
|
+
Define a schema with time fields and generate test data:
|
|
15960
|
+
|
|
15961
|
+
```python
|
|
15962
|
+
import pointblank as pb
|
|
15963
|
+
from datetime import time
|
|
15964
|
+
|
|
15965
|
+
# Define a schema with time field specifications
|
|
15966
|
+
schema = pb.Schema(
|
|
15967
|
+
start_time=pb.time_field(
|
|
15968
|
+
min_time=time(9, 0, 0),
|
|
15969
|
+
max_time=time(12, 0, 0)
|
|
15970
|
+
),
|
|
15971
|
+
end_time=pb.time_field(
|
|
15972
|
+
min_time=time(13, 0, 0),
|
|
15973
|
+
max_time=time(17, 0, 0)
|
|
15974
|
+
),
|
|
15975
|
+
)
|
|
15976
|
+
|
|
15977
|
+
# Generate 100 rows of test data
|
|
15978
|
+
pb.preview(pb.generate_dataset(schema, n=100, seed=23))
|
|
15979
|
+
```
|
|
15980
|
+
|
|
15981
|
+
Time values are uniformly distributed within the specified range.
|
|
15982
|
+
|
|
15983
|
+
|
|
15984
|
+
duration_field(min_duration: 'str | timedelta | None' = None, max_duration: 'str | timedelta | None' = None, nullable: 'bool' = False, null_probability: 'float' = 0.0, unique: 'bool' = False, generator: 'Callable[[], Any] | None' = None) -> 'DurationField'
|
|
15985
|
+
|
|
15986
|
+
Create a duration column specification.
|
|
15987
|
+
|
|
15988
|
+
Parameters
|
|
15989
|
+
----------
|
|
15990
|
+
min_duration
|
|
15991
|
+
Minimum duration (inclusive). Can be string or `timedelta` object.
|
|
15992
|
+
max_duration
|
|
15993
|
+
Maximum duration (inclusive). Can be string or `timedelta` object.
|
|
15994
|
+
nullable
|
|
15995
|
+
Whether the column can contain null values. Default is `False`.
|
|
15996
|
+
null_probability
|
|
15997
|
+
Probability of generating null when `nullable=True`. Default is `0.0`.
|
|
15998
|
+
unique
|
|
15999
|
+
Whether all values must be unique. Default is `False`.
|
|
16000
|
+
generator
|
|
16001
|
+
Custom callable that generates values. Overrides other settings.
|
|
16002
|
+
|
|
16003
|
+
Returns
|
|
16004
|
+
-------
|
|
16005
|
+
DurationField
|
|
16006
|
+
A duration field specification.
|
|
16007
|
+
|
|
16008
|
+
Examples
|
|
16009
|
+
--------
|
|
16010
|
+
Define a schema with duration fields and generate test data:
|
|
16011
|
+
|
|
16012
|
+
```python
|
|
16013
|
+
import pointblank as pb
|
|
16014
|
+
from datetime import timedelta
|
|
16015
|
+
|
|
16016
|
+
# Define a schema with duration field specifications
|
|
16017
|
+
schema = pb.Schema(
|
|
16018
|
+
session_length=pb.duration_field(
|
|
16019
|
+
min_duration=timedelta(minutes=5),
|
|
16020
|
+
max_duration=timedelta(hours=2)
|
|
16021
|
+
),
|
|
16022
|
+
wait_time=pb.duration_field(
|
|
16023
|
+
min_duration=timedelta(seconds=30),
|
|
16024
|
+
max_duration=timedelta(minutes=15)
|
|
16025
|
+
),
|
|
16026
|
+
)
|
|
16027
|
+
|
|
16028
|
+
# Generate 100 rows of test data
|
|
16029
|
+
pb.generate_dataset(schema, n=100, seed=23)
|
|
16030
|
+
```
|
|
16031
|
+
|
|
16032
|
+
Duration values are uniformly distributed within the specified range.
|
|
16033
|
+
|
|
16034
|
+
|
|
16035
|
+
|
|
15244
16036
|
## The Prebuilt Actions family
|
|
15245
16037
|
|
|
15246
16038
|
The Prebuilt Actions group contains a function that can be used to
|