pointblank 0.13.4__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +4 -0
- pointblank/_constants.py +54 -0
- pointblank/_constants_translations.py +487 -2
- pointblank/_interrogation.py +182 -11
- pointblank/_utils.py +3 -3
- pointblank/_utils_ai.py +850 -0
- pointblank/cli.py +128 -115
- pointblank/column.py +1 -1
- pointblank/data/api-docs.txt +198 -13
- pointblank/data/validations/README.md +108 -0
- pointblank/data/validations/complex_preprocessing.json +54 -0
- pointblank/data/validations/complex_preprocessing.pkl +0 -0
- pointblank/data/validations/generate_test_files.py +127 -0
- pointblank/data/validations/multiple_steps.json +83 -0
- pointblank/data/validations/multiple_steps.pkl +0 -0
- pointblank/data/validations/narwhals_function.json +28 -0
- pointblank/data/validations/narwhals_function.pkl +0 -0
- pointblank/data/validations/no_preprocessing.json +83 -0
- pointblank/data/validations/no_preprocessing.pkl +0 -0
- pointblank/data/validations/pandas_compatible.json +28 -0
- pointblank/data/validations/pandas_compatible.pkl +0 -0
- pointblank/data/validations/preprocessing_functions.py +46 -0
- pointblank/data/validations/simple_preprocessing.json +57 -0
- pointblank/data/validations/simple_preprocessing.pkl +0 -0
- pointblank/datascan.py +4 -4
- pointblank/scan_profile.py +6 -6
- pointblank/schema.py +8 -82
- pointblank/thresholds.py +1 -1
- pointblank/validate.py +1233 -12
- {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/METADATA +66 -8
- pointblank-0.14.0.dist-info/RECORD +55 -0
- pointblank-0.13.4.dist-info/RECORD +0 -39
- {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/WHEEL +0 -0
- {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/top_level.txt +0 -0
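
The largest additions are the new `pointblank/_utils_ai.py` module (+850 lines), roughly 1,200 new lines in `pointblank/validate.py`, and a set of reference fixtures under `pointblank/data/validations/` that pair JSON step reports with pickled validation objects for regression testing. As a minimal sketch of reading one of the bundled JSON reports, assuming only the packaged paths listed above:

    import json
    from importlib.resources import files

    # Locate a bundled reference report inside the installed package
    report = files("pointblank") / "data" / "validations" / "multiple_steps.json"

    # Each report is a JSON array with one object per validation step
    for step in json.loads(report.read_text()):
        print(step["i"], step["assertion_type"], step["column"], step["f_passed"])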

pointblank/data/validations/multiple_steps.json
ADDED

@@ -0,0 +1,83 @@
+[
+    {
+        "i": 1,
+        "i_o": 1,
+        "assertion_type": "col_vals_gt",
+        "column": "a",
+        "values": 2,
+        "inclusive": null,
+        "na_pass": false,
+        "pre": "def double_column_a(df):\n    \"\"\"Double the values in column 'a'.\"\"\"\n    return df.with_columns(pl.col(\"a\") * 2)",
+        "segments": null,
+        "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+        "label": null,
+        "brief": null,
+        "active": true,
+        "all_passed": false,
+        "n": 10,
+        "n_passed": 9,
+        "n_failed": 1,
+        "f_passed": 0.9,
+        "f_failed": 0.1,
+        "warning": null,
+        "error": null,
+        "critical": null,
+        "time_processed": "2025-10-02T04:16:44.712+00:00",
+        "proc_duration_s": 0.00152
+    },
+    {
+        "i": 2,
+        "i_o": 2,
+        "assertion_type": "col_vals_in_set",
+        "column": "c",
+        "values": [
+            "x",
+            "y"
+        ],
+        "inclusive": null,
+        "na_pass": null,
+        "pre": "def filter_by_d_gt_100(df):\n    \"\"\"Filter rows where column 'd' is greater than 100.\"\"\"\n    return df.filter(pl.col(\"d\") > 100)",
+        "segments": null,
+        "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+        "label": null,
+        "brief": null,
+        "active": true,
+        "all_passed": true,
+        "n": 7,
+        "n_passed": 7,
+        "n_failed": 0,
+        "f_passed": 1.0,
+        "f_failed": 0.0,
+        "warning": null,
+        "error": null,
+        "critical": null,
+        "time_processed": "2025-10-02T04:16:44.713+00:00",
+        "proc_duration_s": 0.001
+    },
+    {
+        "i": 3,
+        "i_o": 3,
+        "assertion_type": "col_vals_gt",
+        "column": "sum_ab",
+        "values": 100,
+        "inclusive": null,
+        "na_pass": false,
+        "pre": "def add_computed_column(df):\n    \"\"\"Add a computed column based on existing columns.\"\"\"\n    return df.with_columns((pl.col(\"a\") + pl.col(\"b\")).alias(\"sum_ab\"))",
+        "segments": null,
+        "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+        "label": null,
+        "brief": null,
+        "active": true,
+        "all_passed": false,
+        "n": 10,
+        "n_passed": 1,
+        "n_failed": 9,
+        "f_passed": 0.1,
+        "f_failed": 0.9,
+        "warning": null,
+        "error": null,
+        "critical": null,
+        "time_processed": "2025-10-02T04:16:44.714+00:00",
+        "proc_duration_s": 0.001464
+    }
+]

pointblank/data/validations/multiple_steps.pkl
ADDED
Binary file
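
A report of this shape comes from a three-step validation plan whose steps carry `pre=` preprocessing callables. The following is a minimal sketch, assuming the public pointblank API (`Validate`, `col_vals_gt`, `col_vals_in_set`, `interrogate`, `get_json_report`); the input table is illustrative rather than the actual fixture, with values chosen so the pass/fail counts line up with the report above:

    import pointblank as pb
    import polars as pl

    def double_column_a(df):
        return df.with_columns(pl.col("a") * 2)

    def filter_by_d_gt_100(df):
        return df.filter(pl.col("d") > 100)

    def add_computed_column(df):
        return df.with_columns((pl.col("a") + pl.col("b")).alias("sum_ab"))

    tbl = pl.DataFrame({
        "a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],            # illustrative values
        "b": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
        "c": ["x", "y", "x", "y", "x", "y", "x", "y", "x", "y"],
        "d": [50, 150, 250, 110, 120, 130, 140, 160, 90, 95],
    })

    validation = (
        pb.Validate(data=tbl)
        .col_vals_gt(columns="a", value=2, pre=double_column_a)                 # step 1
        .col_vals_in_set(columns="c", set=["x", "y"], pre=filter_by_d_gt_100)   # step 2
        .col_vals_gt(columns="sum_ab", value=100, pre=add_computed_column)      # step 3
        .interrogate()
    )

    print(validation.get_json_report())  # step-level JSON like the file above

Note how each "pre" field stores the preprocessing function as source text, which is what makes the serialized validation self-describing.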

pointblank/data/validations/narwhals_function.json
ADDED

@@ -0,0 +1,28 @@
+[
+    {
+        "i": 1,
+        "i_o": 1,
+        "assertion_type": "col_vals_gt",
+        "column": "a",
+        "values": 5,
+        "inclusive": null,
+        "na_pass": false,
+        "pre": "def narwhals_median_transform(df):\n    \"\"\"Use narwhals to compute median - cross-backend compatible.\"\"\"\n    return nw.from_native(df).select(nw.median(\"a\"), nw.median(\"d\"))",
+        "segments": null,
+        "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+        "label": null,
+        "brief": null,
+        "active": true,
+        "all_passed": true,
+        "n": 1,
+        "n_passed": 1,
+        "n_failed": 0,
+        "f_passed": 1.0,
+        "f_failed": 0.0,
+        "warning": null,
+        "error": null,
+        "critical": null,
+        "time_processed": "2025-10-02T04:16:44.710+00:00",
+        "proc_duration_s": 0.001455
+    }
+]

pointblank/data/validations/narwhals_function.pkl
ADDED
Binary file

pointblank/data/validations/no_preprocessing.json
ADDED

@@ -0,0 +1,83 @@
+[
+    {
+        "i": 1,
+        "i_o": 1,
+        "assertion_type": "col_vals_gt",
+        "column": "a",
+        "values": 0,
+        "inclusive": null,
+        "na_pass": false,
+        "pre": null,
+        "segments": null,
+        "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+        "label": null,
+        "brief": null,
+        "active": true,
+        "all_passed": true,
+        "n": 10,
+        "n_passed": 10,
+        "n_failed": 0,
+        "f_passed": 1.0,
+        "f_failed": 0.0,
+        "warning": null,
+        "error": null,
+        "critical": null,
+        "time_processed": "2025-10-02T04:16:44.718+00:00",
+        "proc_duration_s": 0.001148
+    },
+    {
+        "i": 2,
+        "i_o": 2,
+        "assertion_type": "col_vals_lt",
+        "column": "d",
+        "values": 300,
+        "inclusive": null,
+        "na_pass": false,
+        "pre": null,
+        "segments": null,
+        "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+        "label": null,
+        "brief": null,
+        "active": true,
+        "all_passed": true,
+        "n": 10,
+        "n_passed": 10,
+        "n_failed": 0,
+        "f_passed": 1.0,
+        "f_failed": 0.0,
+        "warning": null,
+        "error": null,
+        "critical": null,
+        "time_processed": "2025-10-02T04:16:44.719+00:00",
+        "proc_duration_s": 0.001181
+    },
+    {
+        "i": 3,
+        "i_o": 3,
+        "assertion_type": "col_vals_in_set",
+        "column": "c",
+        "values": [
+            "x",
+            "y"
+        ],
+        "inclusive": null,
+        "na_pass": null,
+        "pre": null,
+        "segments": null,
+        "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+        "label": null,
+        "brief": null,
+        "active": true,
+        "all_passed": true,
+        "n": 10,
+        "n_passed": 10,
+        "n_failed": 0,
+        "f_passed": 1.0,
+        "f_failed": 0.0,
+        "warning": null,
+        "error": null,
+        "critical": null,
+        "time_processed": "2025-10-02T04:16:44.720+00:00",
+        "proc_duration_s": 0.000892
+    }
+]

pointblank/data/validations/no_preprocessing.pkl
ADDED
Binary file

pointblank/data/validations/pandas_compatible.json
ADDED

@@ -0,0 +1,28 @@
+[
+    {
+        "i": 1,
+        "i_o": 1,
+        "assertion_type": "col_vals_gt",
+        "column": "a_plus_b",
+        "values": 10,
+        "inclusive": null,
+        "na_pass": false,
+        "pre": "def pandas_compatible_transform(df):\n    \"\"\"Transform that works with pandas DataFrames.\"\"\"\n    if hasattr(df, \"assign\"):  # pandas\n        return df.assign(a_plus_b=df[\"a\"] + df.get(\"b\", 0))\n    else:  # polars or other\n        return df.with_columns((pl.col(\"a\") + pl.col(\"b\")).alias(\"a_plus_b\"))",
+        "segments": null,
+        "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+        "label": null,
+        "brief": null,
+        "active": true,
+        "all_passed": true,
+        "n": 10,
+        "n_passed": 10,
+        "n_failed": 0,
+        "f_passed": 1.0,
+        "f_failed": 0.0,
+        "warning": null,
+        "error": null,
+        "critical": null,
+        "time_processed": "2025-10-02T04:16:44.717+00:00",
+        "proc_duration_s": 0.001428
+    }
+]

pointblank/data/validations/pandas_compatible.pkl
ADDED
Binary file

pointblank/data/validations/preprocessing_functions.py
ADDED

@@ -0,0 +1,46 @@
+"""
+Test preprocessing functions for validation serialization examples.
+
+These functions are used to create validation objects that can be serialized
+and stored as reference files for regression testing.
+"""
+
+import narwhals as nw
+import polars as pl
+
+
+def double_column_a(df):
+    """Double the values in column 'a'."""
+    return df.with_columns(pl.col("a") * 2)
+
+
+def add_computed_column(df):
+    """Add a computed column based on existing columns."""
+    return df.with_columns((pl.col("a") + pl.col("b")).alias("sum_ab"))
+
+
+def filter_by_d_gt_100(df):
+    """Filter rows where column 'd' is greater than 100."""
+    return df.filter(pl.col("d") > 100)
+
+
+def narwhals_median_transform(df):
+    """Use narwhals to compute median - cross-backend compatible."""
+    return nw.from_native(df).select(nw.median("a"), nw.median("d"))
+
+
+def complex_preprocessing(df):
+    """Complex preprocessing combining multiple operations."""
+    return (
+        df.filter(pl.col("a") > 1)
+        .with_columns((pl.col("a") * 2).alias("a_doubled"), (pl.col("d") / 10).alias("d_scaled"))
+        .filter(pl.col("d_scaled") > 10)
+    )
+
+
+def pandas_compatible_transform(df):
+    """Transform that works with pandas DataFrames."""
+    if hasattr(df, "assign"):  # pandas
+        return df.assign(a_plus_b=df["a"] + df.get("b", 0))
+    else:  # polars or other
+        return df.with_columns((pl.col("a") + pl.col("b")).alias("a_plus_b"))
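
One design detail worth noting: `pandas_compatible_transform` dispatches on `hasattr(df, "assign")` rather than an `isinstance` check against `pd.DataFrame`, so the module never has to import pandas. A minimal standalone sketch of that duck-typing dispatch (the frames below are illustrative):

    import pandas as pd
    import polars as pl

    def pandas_compatible_transform(df):
        """Add an 'a_plus_b' column using whichever frame API `df` exposes."""
        if hasattr(df, "assign"):  # pandas-style frames expose .assign
            return df.assign(a_plus_b=df["a"] + df.get("b", 0))
        # polars-style frames expose .with_columns
        return df.with_columns((pl.col("a") + pl.col("b")).alias("a_plus_b"))

    print(pandas_compatible_transform(pd.DataFrame({"a": [1], "b": [2]})))
    print(pandas_compatible_transform(pl.DataFrame({"a": [1], "b": [2]})))

Both calls yield a frame where a_plus_b == 3, each built through its own backend's API.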

pointblank/data/validations/simple_preprocessing.json
ADDED

@@ -0,0 +1,57 @@
+[
+    {
+        "i": 1,
+        "i_o": 1,
+        "assertion_type": "col_vals_gt",
+        "column": "a",
+        "values": 0,
+        "inclusive": null,
+        "na_pass": false,
+        "pre": "def double_column_a(df):\n    \"\"\"Double the values in column 'a'.\"\"\"\n    return df.with_columns(pl.col(\"a\") * 2)",
+        "segments": null,
+        "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+        "label": null,
+        "brief": null,
+        "active": true,
+        "all_passed": true,
+        "n": 10,
+        "n_passed": 10,
+        "n_failed": 0,
+        "f_passed": 1.0,
+        "f_failed": 0.0,
+        "warning": null,
+        "error": null,
+        "critical": null,
+        "time_processed": "2025-10-02T04:16:44.702+00:00",
+        "proc_duration_s": 0.00387
+    },
+    {
+        "i": 2,
+        "i_o": 2,
+        "assertion_type": "col_vals_in_set",
+        "column": "c",
+        "values": [
+            "x",
+            "y"
+        ],
+        "inclusive": null,
+        "na_pass": null,
+        "pre": null,
+        "segments": null,
+        "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+        "label": null,
+        "brief": null,
+        "active": true,
+        "all_passed": true,
+        "n": 10,
+        "n_passed": 10,
+        "n_failed": 0,
+        "f_passed": 1.0,
+        "f_failed": 0.0,
+        "warning": null,
+        "error": null,
+        "critical": null,
+        "time_processed": "2025-10-02T04:16:44.703+00:00",
+        "proc_duration_s": 0.000983
+    }
+]

pointblank/data/validations/simple_preprocessing.pkl
ADDED
Binary file

pointblank/datascan.py
CHANGED

@@ -143,17 +143,17 @@ class DataScan:
         for conv_method in valid_conversion_methods:
             try:
                 valid_native = getattr(ibis_native, conv_method)()
-            except (NotImplementedError, ImportError, ModuleNotFoundError):
-                continue
+            except (NotImplementedError, ImportError, ModuleNotFoundError):  # pragma: no cover
+                continue  # pragma: no cover
             break
-        else:
+        else:  # pragma: no cover
             msg = (
                 "To use `ibis` as input, you must have one of arrow, pandas, polars or numpy "
                 "available in the process. Until `ibis` is fully supported by Narwhals, this is "
                 "necessary. Additionally, the data must be collected in order to calculate some "
                 "structural statistics, which may be performance detrimental."
             )
-            raise ImportError(msg)
+            raise ImportError(msg)  # pragma: no cover
         as_native = nw.from_native(valid_native)

         self.nw_data: Frame = nw.from_native(as_native)

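The newly excluded `else` branch is Python's `for ... else`: it runs only when the loop completes without `break`, i.e. when every conversion method failed. A minimal standalone sketch of the pattern (the class and method names are illustrative, not the pointblank internals):

    class DemoTable:
        """Stand-in for an ibis table with partial conversion support."""

        def to_polars(self):
            raise NotImplementedError("polars conversion unavailable here")

        def to_pandas(self):
            return "pandas frame"  # stand-in for a real converted frame

    source = DemoTable()
    for method in ("to_polars", "to_pandas"):
        try:
            native = getattr(source, method)()
        except (NotImplementedError, ImportError, ModuleNotFoundError):
            continue  # this backend failed; try the next one
        break  # a conversion succeeded, so the else clause is skipped
    else:
        raise ImportError("no supported conversion backend available")

    print(native)  # -> "pandas frame"

Since the fallback `else` only fires when no conversion backend is importable, it cannot be exercised in a normally provisioned test environment, which is presumably why it gains `# pragma: no cover`.
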
pointblank/scan_profile.py
CHANGED

@@ -299,12 +299,12 @@ class _DataProfile: # TODO: feels redundant and weird
         # instantiations that require consistent types.
         all_same_type: bool = all(type(v) is first_type for v in values[1:])
         if not all_same_type:
-            if strict:
-                msg = f"Some types in {key!s} stat are different. Turn off `strict` to bypass."
-                raise TypeError(msg)
-            for d in cols:
-                if key in d:
-                    d[key] = str(d[key])
+            if strict:  # pragma: no cover
+                msg = f"Some types in {key!s} stat are different. Turn off `strict` to bypass."  # pragma: no cover
+                raise TypeError(msg)  # pragma: no cover
+            for d in cols:  # pragma: no cover
+                if key in d:  # pragma: no cover
+                    d[key] = str(d[key])  # pragma: no cover

         return nw.from_dict(transpose_dicts(cols), backend=self.implementation)

pointblank/schema.py
CHANGED

@@ -343,15 +343,15 @@ class Schema:
             schema_dict = {k: str(v) for k, v in schema_dict.items()}
             self.columns = list(schema_dict.items())

-        elif table_type == "pyspark":
+        elif table_type == "pyspark":  # pragma: no cover
             # Convert PySpark DataFrame to Narwhals to get schema
-            nw_df = nw.from_native(self.tbl)
-            if _is_lazy_frame(data=nw_df):
-                schema_dict = dict(nw_df.collect_schema())
-            else:
-                schema_dict = dict(nw_df.schema.items())
-            schema_dict = {k: str(v) for k, v in schema_dict.items()}
-            self.columns = list(schema_dict.items())
+            nw_df = nw.from_native(self.tbl)  # pragma: no cover
+            if _is_lazy_frame(data=nw_df):  # pragma: no cover
+                schema_dict = dict(nw_df.collect_schema())  # pragma: no cover
+            else:  # pragma: no cover
+                schema_dict = dict(nw_df.schema.items())  # pragma: no cover
+            schema_dict = {k: str(v) for k, v in schema_dict.items()}  # pragma: no cover
+            self.columns = list(schema_dict.items())  # pragma: no cover

         elif table_type in IBIS_BACKENDS:
             schema_dict = dict(self.tbl.schema().items())

@@ -888,80 +888,6 @@ def _schema_info_generate_params_dict(
     }


-def _check_schema_match(
-    data_tbl: any,
-    schema: Schema,
-    complete: bool = True,
-    in_order: bool = True,
-    case_sensitive_colnames: bool = True,
-    case_sensitive_dtypes: bool = True,
-    full_match_dtypes: bool = True,
-) -> bool:
-    """
-    Check if the schema matches the target table.
-
-    This function performs schema validation and returns a boolean result.
-
-    Parameters
-    ----------
-    data_tbl
-        The target table to validate.
-    schema
-        The expected schema.
-    complete
-        Whether the schema should be complete.
-    in_order
-        Whether the schema should be in order.
-    case_sensitive_colnames
-        Whether column names are case-sensitive.
-    case_sensitive_dtypes
-        Whether data types are case-sensitive.
-    full_match_dtypes
-        Whether data types must match exactly.
-
-    Returns
-    -------
-    bool
-        True if the schema matches, False otherwise.
-    """
-    validation_info = _get_schema_validation_info(
-        data_tbl=data_tbl,
-        schema=schema,
-        passed=False,  # This will be determined by the logic below
-        complete=complete,
-        in_order=in_order,
-        case_sensitive_colnames=case_sensitive_colnames,
-        case_sensitive_dtypes=case_sensitive_dtypes,
-        full_match_dtypes=full_match_dtypes,
-    )
-
-    # Determine if the schema validation passed based on the validation info
-    passed = True
-
-    # Check completeness requirement
-    if complete and not validation_info["columns_full_set"]:
-        passed = False
-
-    # Check order requirement
-    if in_order and not validation_info["columns_matched_in_order"]:
-        passed = False
-
-    # Check if all expected columns were found
-    if validation_info["columns_not_found"]:
-        passed = False
-
-    # Check column-specific validations
-    for col_info in validation_info["columns"].values():
-        if not col_info["colname_matched"]:
-            passed = False
-        if not col_info.get(
-            "dtype_matched", True
-        ):  # dtype_matched may not exist if no dtypes specified
-            passed = False
-
-    return passed
-
-
 def _get_schema_validation_info(
     data_tbl: any,
     schema: Schema,
pointblank/thresholds.py
CHANGED

@@ -559,7 +559,7 @@ class FinalActions:
     def send_alert():
         summary = pb.get_validation_summary()
         if summary["highest_severity"] == "critical":
-            print(f"ALERT: Critical validation failures found in {summary['
+            print(f"ALERT: Critical validation failures found in {summary['tbl_name']}")

     validation = (
         pb.Validate(