pointblank 0.13.4__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. pointblank/__init__.py +4 -0
  2. pointblank/_constants.py +54 -0
  3. pointblank/_constants_translations.py +487 -2
  4. pointblank/_interrogation.py +182 -11
  5. pointblank/_utils.py +3 -3
  6. pointblank/_utils_ai.py +850 -0
  7. pointblank/cli.py +128 -115
  8. pointblank/column.py +1 -1
  9. pointblank/data/api-docs.txt +198 -13
  10. pointblank/data/validations/README.md +108 -0
  11. pointblank/data/validations/complex_preprocessing.json +54 -0
  12. pointblank/data/validations/complex_preprocessing.pkl +0 -0
  13. pointblank/data/validations/generate_test_files.py +127 -0
  14. pointblank/data/validations/multiple_steps.json +83 -0
  15. pointblank/data/validations/multiple_steps.pkl +0 -0
  16. pointblank/data/validations/narwhals_function.json +28 -0
  17. pointblank/data/validations/narwhals_function.pkl +0 -0
  18. pointblank/data/validations/no_preprocessing.json +83 -0
  19. pointblank/data/validations/no_preprocessing.pkl +0 -0
  20. pointblank/data/validations/pandas_compatible.json +28 -0
  21. pointblank/data/validations/pandas_compatible.pkl +0 -0
  22. pointblank/data/validations/preprocessing_functions.py +46 -0
  23. pointblank/data/validations/simple_preprocessing.json +57 -0
  24. pointblank/data/validations/simple_preprocessing.pkl +0 -0
  25. pointblank/datascan.py +4 -4
  26. pointblank/scan_profile.py +6 -6
  27. pointblank/schema.py +8 -82
  28. pointblank/thresholds.py +1 -1
  29. pointblank/validate.py +1233 -12
  30. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/METADATA +66 -8
  31. pointblank-0.14.0.dist-info/RECORD +55 -0
  32. pointblank-0.13.4.dist-info/RECORD +0 -39
  33. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/WHEEL +0 -0
  34. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/entry_points.txt +0 -0
  35. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/licenses/LICENSE +0 -0
  36. {pointblank-0.13.4.dist-info → pointblank-0.14.0.dist-info}/top_level.txt +0 -0
pointblank/data/validations/multiple_steps.json ADDED
@@ -0,0 +1,83 @@
+ [
+   {
+     "i": 1,
+     "i_o": 1,
+     "assertion_type": "col_vals_gt",
+     "column": "a",
+     "values": 2,
+     "inclusive": null,
+     "na_pass": false,
+     "pre": "def double_column_a(df):\n \"\"\"Double the values in column 'a'.\"\"\"\n return df.with_columns(pl.col(\"a\") * 2)",
+     "segments": null,
+     "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+     "label": null,
+     "brief": null,
+     "active": true,
+     "all_passed": false,
+     "n": 10,
+     "n_passed": 9,
+     "n_failed": 1,
+     "f_passed": 0.9,
+     "f_failed": 0.1,
+     "warning": null,
+     "error": null,
+     "critical": null,
+     "time_processed": "2025-10-02T04:16:44.712+00:00",
+     "proc_duration_s": 0.00152
+   },
+   {
+     "i": 2,
+     "i_o": 2,
+     "assertion_type": "col_vals_in_set",
+     "column": "c",
+     "values": [
+       "x",
+       "y"
+     ],
+     "inclusive": null,
+     "na_pass": null,
+     "pre": "def filter_by_d_gt_100(df):\n \"\"\"Filter rows where column 'd' is greater than 100.\"\"\"\n return df.filter(pl.col(\"d\") > 100)",
+     "segments": null,
+     "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+     "label": null,
+     "brief": null,
+     "active": true,
+     "all_passed": true,
+     "n": 7,
+     "n_passed": 7,
+     "n_failed": 0,
+     "f_passed": 1.0,
+     "f_failed": 0.0,
+     "warning": null,
+     "error": null,
+     "critical": null,
+     "time_processed": "2025-10-02T04:16:44.713+00:00",
+     "proc_duration_s": 0.001
+   },
+   {
+     "i": 3,
+     "i_o": 3,
+     "assertion_type": "col_vals_gt",
+     "column": "sum_ab",
+     "values": 100,
+     "inclusive": null,
+     "na_pass": false,
+     "pre": "def add_computed_column(df):\n \"\"\"Add a computed column based on existing columns.\"\"\"\n return df.with_columns((pl.col(\"a\") + pl.col(\"b\")).alias(\"sum_ab\"))",
+     "segments": null,
+     "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+     "label": null,
+     "brief": null,
+     "active": true,
+     "all_passed": false,
+     "n": 10,
+     "n_passed": 1,
+     "n_failed": 9,
+     "f_passed": 0.1,
+     "f_failed": 0.9,
+     "warning": null,
+     "error": null,
+     "critical": null,
+     "time_processed": "2025-10-02T04:16:44.714+00:00",
+     "proc_duration_s": 0.001464
+   }
+ ]
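The serialized reports added above are plain JSON, so they can be inspected without pointblank installed. A minimal sketch using only the standard library; the file path and field names follow the records shown in this hunk:

import json

# Load a serialized validation report and summarize each step.
with open("pointblank/data/validations/multiple_steps.json") as f:
    steps = json.load(f)

for step in steps:
    status = "all passed" if step["all_passed"] else f"{step['n_failed']}/{step['n']} failed"
    print(f"step {step['i']}: {step['assertion_type']} on column {step['column']!r} -> {status}")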
pointblank/data/validations/narwhals_function.json ADDED
@@ -0,0 +1,28 @@
+ [
+   {
+     "i": 1,
+     "i_o": 1,
+     "assertion_type": "col_vals_gt",
+     "column": "a",
+     "values": 5,
+     "inclusive": null,
+     "na_pass": false,
+     "pre": "def narwhals_median_transform(df):\n \"\"\"Use narwhals to compute median - cross-backend compatible.\"\"\"\n return nw.from_native(df).select(nw.median(\"a\"), nw.median(\"d\"))",
+     "segments": null,
+     "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+     "label": null,
+     "brief": null,
+     "active": true,
+     "all_passed": true,
+     "n": 1,
+     "n_passed": 1,
+     "n_failed": 0,
+     "f_passed": 1.0,
+     "f_failed": 0.0,
+     "warning": null,
+     "error": null,
+     "critical": null,
+     "time_processed": "2025-10-02T04:16:44.710+00:00",
+     "proc_duration_s": 0.001455
+   }
+ ]
pointblank/data/validations/no_preprocessing.json ADDED
@@ -0,0 +1,83 @@
+ [
+   {
+     "i": 1,
+     "i_o": 1,
+     "assertion_type": "col_vals_gt",
+     "column": "a",
+     "values": 0,
+     "inclusive": null,
+     "na_pass": false,
+     "pre": null,
+     "segments": null,
+     "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+     "label": null,
+     "brief": null,
+     "active": true,
+     "all_passed": true,
+     "n": 10,
+     "n_passed": 10,
+     "n_failed": 0,
+     "f_passed": 1.0,
+     "f_failed": 0.0,
+     "warning": null,
+     "error": null,
+     "critical": null,
+     "time_processed": "2025-10-02T04:16:44.718+00:00",
+     "proc_duration_s": 0.001148
+   },
+   {
+     "i": 2,
+     "i_o": 2,
+     "assertion_type": "col_vals_lt",
+     "column": "d",
+     "values": 300,
+     "inclusive": null,
+     "na_pass": false,
+     "pre": null,
+     "segments": null,
+     "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+     "label": null,
+     "brief": null,
+     "active": true,
+     "all_passed": true,
+     "n": 10,
+     "n_passed": 10,
+     "n_failed": 0,
+     "f_passed": 1.0,
+     "f_failed": 0.0,
+     "warning": null,
+     "error": null,
+     "critical": null,
+     "time_processed": "2025-10-02T04:16:44.719+00:00",
+     "proc_duration_s": 0.001181
+   },
+   {
+     "i": 3,
+     "i_o": 3,
+     "assertion_type": "col_vals_in_set",
+     "column": "c",
+     "values": [
+       "x",
+       "y"
+     ],
+     "inclusive": null,
+     "na_pass": null,
+     "pre": null,
+     "segments": null,
+     "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+     "label": null,
+     "brief": null,
+     "active": true,
+     "all_passed": true,
+     "n": 10,
+     "n_passed": 10,
+     "n_failed": 0,
+     "f_passed": 1.0,
+     "f_failed": 0.0,
+     "warning": null,
+     "error": null,
+     "critical": null,
+     "time_processed": "2025-10-02T04:16:44.720+00:00",
+     "proc_duration_s": 0.000892
+   }
+ ]
pointblank/data/validations/pandas_compatible.json ADDED
@@ -0,0 +1,28 @@
+ [
+   {
+     "i": 1,
+     "i_o": 1,
+     "assertion_type": "col_vals_gt",
+     "column": "a_plus_b",
+     "values": 10,
+     "inclusive": null,
+     "na_pass": false,
+     "pre": "def pandas_compatible_transform(df):\n \"\"\"Transform that works with pandas DataFrames.\"\"\"\n if hasattr(df, \"assign\"): # pandas\n return df.assign(a_plus_b=df[\"a\"] + df.get(\"b\", 0))\n else: # polars or other\n return df.with_columns((pl.col(\"a\") + pl.col(\"b\")).alias(\"a_plus_b\"))",
+     "segments": null,
+     "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+     "label": null,
+     "brief": null,
+     "active": true,
+     "all_passed": true,
+     "n": 10,
+     "n_passed": 10,
+     "n_failed": 0,
+     "f_passed": 1.0,
+     "f_failed": 0.0,
+     "warning": null,
+     "error": null,
+     "critical": null,
+     "time_processed": "2025-10-02T04:16:44.717+00:00",
+     "proc_duration_s": 0.001428
+   }
+ ]
pointblank/data/validations/preprocessing_functions.py ADDED
@@ -0,0 +1,46 @@
+ """
+ Test preprocessing functions for validation serialization examples.
+
+ These functions are used to create validation objects that can be serialized
+ and stored as reference files for regression testing.
+ """
+
+ import narwhals as nw
+ import polars as pl
+
+
+ def double_column_a(df):
+     """Double the values in column 'a'."""
+     return df.with_columns(pl.col("a") * 2)
+
+
+ def add_computed_column(df):
+     """Add a computed column based on existing columns."""
+     return df.with_columns((pl.col("a") + pl.col("b")).alias("sum_ab"))
+
+
+ def filter_by_d_gt_100(df):
+     """Filter rows where column 'd' is greater than 100."""
+     return df.filter(pl.col("d") > 100)
+
+
+ def narwhals_median_transform(df):
+     """Use narwhals to compute median - cross-backend compatible."""
+     return nw.from_native(df).select(nw.median("a"), nw.median("d"))
+
+
+ def complex_preprocessing(df):
+     """Complex preprocessing combining multiple operations."""
+     return (
+         df.filter(pl.col("a") > 1)
+         .with_columns((pl.col("a") * 2).alias("a_doubled"), (pl.col("d") / 10).alias("d_scaled"))
+         .filter(pl.col("d_scaled") > 10)
+     )
+
+
+ def pandas_compatible_transform(df):
+     """Transform that works with pandas DataFrames."""
+     if hasattr(df, "assign"):  # pandas
+         return df.assign(a_plus_b=df["a"] + df.get("b", 0))
+     else:  # polars or other
+         return df.with_columns((pl.col("a") + pl.col("b")).alias("a_plus_b"))
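These callables are the source of the "pre" strings embedded in the JSON reports above: a validation step can carry a preprocessing function that transforms the table before the assertion runs. A hedged sketch of attaching one to a step; the toy table is invented, and the col_vals_gt(columns=, value=, pre=) signature is assumed from the serialized fields shown above:

import polars as pl
import pointblank as pb
from pointblank.data.validations.preprocessing_functions import double_column_a

tbl = pl.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30]})

validation = (
    pb.Validate(data=tbl)
    # double_column_a runs before the assertion is evaluated
    .col_vals_gt(columns="a", value=2, pre=double_column_a)
    .interrogate()
)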
pointblank/data/validations/simple_preprocessing.json ADDED
@@ -0,0 +1,57 @@
+ [
+   {
+     "i": 1,
+     "i_o": 1,
+     "assertion_type": "col_vals_gt",
+     "column": "a",
+     "values": 0,
+     "inclusive": null,
+     "na_pass": false,
+     "pre": "def double_column_a(df):\n \"\"\"Double the values in column 'a'.\"\"\"\n return df.with_columns(pl.col(\"a\") * 2)",
+     "segments": null,
+     "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+     "label": null,
+     "brief": null,
+     "active": true,
+     "all_passed": true,
+     "n": 10,
+     "n_passed": 10,
+     "n_failed": 0,
+     "f_passed": 1.0,
+     "f_failed": 0.0,
+     "warning": null,
+     "error": null,
+     "critical": null,
+     "time_processed": "2025-10-02T04:16:44.702+00:00",
+     "proc_duration_s": 0.00387
+   },
+   {
+     "i": 2,
+     "i_o": 2,
+     "assertion_type": "col_vals_in_set",
+     "column": "c",
+     "values": [
+       "x",
+       "y"
+     ],
+     "inclusive": null,
+     "na_pass": null,
+     "pre": null,
+     "segments": null,
+     "thresholds": "Thresholds(warning=None, error=None, critical=None)",
+     "label": null,
+     "brief": null,
+     "active": true,
+     "all_passed": true,
+     "n": 10,
+     "n_passed": 10,
+     "n_failed": 0,
+     "f_passed": 1.0,
+     "f_failed": 0.0,
+     "warning": null,
+     "error": null,
+     "critical": null,
+     "time_processed": "2025-10-02T04:16:44.703+00:00",
+     "proc_duration_s": 0.000983
+   }
+ ]
pointblank/datascan.py CHANGED
@@ -143,17 +143,17 @@ class DataScan:
         for conv_method in valid_conversion_methods:
             try:
                 valid_native = getattr(ibis_native, conv_method)()
-            except (NotImplementedError, ImportError, ModuleNotFoundError):
-                continue
+            except (NotImplementedError, ImportError, ModuleNotFoundError):  # pragma: no cover
+                continue  # pragma: no cover
             break
-        else:
+        else:  # pragma: no cover
             msg = (
                 "To use `ibis` as input, you must have one of arrow, pandas, polars or numpy "
                 "available in the process. Until `ibis` is fully supported by Narwhals, this is "
                 "necessary. Additionally, the data must be collected in order to calculate some "
                 "structural statistics, which may be performance detrimental."
             )
-            raise ImportError(msg)
+            raise ImportError(msg)  # pragma: no cover
         as_native = nw.from_native(valid_native)

         self.nw_data: Frame = nw.from_native(as_native)
@@ -299,12 +299,12 @@ class _DataProfile:  # TODO: feels redundant and weird
         # instantiations that require consistent types.
         all_same_type: bool = all(type(v) is first_type for v in values[1:])
         if not all_same_type:
-            if strict:
-                msg = f"Some types in {key!s} stat are different. Turn off `strict` to bypass."
-                raise TypeError(msg)
-            for d in cols:
-                if key in d:
-                    d[key] = str(d[key])
+            if strict:  # pragma: no cover
+                msg = f"Some types in {key!s} stat are different. Turn off `strict` to bypass."  # pragma: no cover
+                raise TypeError(msg)  # pragma: no cover
+            for d in cols:  # pragma: no cover
+                if key in d:  # pragma: no cover
+                    d[key] = str(d[key])  # pragma: no cover

         return nw.from_dict(transpose_dicts(cols), backend=self.implementation)
pointblank/schema.py CHANGED
@@ -343,15 +343,15 @@ class Schema:
             schema_dict = {k: str(v) for k, v in schema_dict.items()}
             self.columns = list(schema_dict.items())

-        elif table_type == "pyspark":
+        elif table_type == "pyspark":  # pragma: no cover
             # Convert PySpark DataFrame to Narwhals to get schema
-            nw_df = nw.from_native(self.tbl)
-            if _is_lazy_frame(data=nw_df):
-                schema_dict = dict(nw_df.collect_schema())
-            else:
-                schema_dict = dict(nw_df.schema.items())
-            schema_dict = {k: str(v) for k, v in schema_dict.items()}
-            self.columns = list(schema_dict.items())
+            nw_df = nw.from_native(self.tbl)  # pragma: no cover
+            if _is_lazy_frame(data=nw_df):  # pragma: no cover
+                schema_dict = dict(nw_df.collect_schema())  # pragma: no cover
+            else:  # pragma: no cover
+                schema_dict = dict(nw_df.schema.items())  # pragma: no cover
+            schema_dict = {k: str(v) for k, v in schema_dict.items()}  # pragma: no cover
+            self.columns = list(schema_dict.items())  # pragma: no cover

         elif table_type in IBIS_BACKENDS:
             schema_dict = dict(self.tbl.schema().items())
@@ -888,80 +888,6 @@ def _schema_info_generate_params_dict(
     }


-def _check_schema_match(
-    data_tbl: any,
-    schema: Schema,
-    complete: bool = True,
-    in_order: bool = True,
-    case_sensitive_colnames: bool = True,
-    case_sensitive_dtypes: bool = True,
-    full_match_dtypes: bool = True,
-) -> bool:
-    """
-    Check if the schema matches the target table.
-
-    This function performs schema validation and returns a boolean result.
-
-    Parameters
-    ----------
-    data_tbl
-        The target table to validate.
-    schema
-        The expected schema.
-    complete
-        Whether the schema should be complete.
-    in_order
-        Whether the schema should be in order.
-    case_sensitive_colnames
-        Whether column names are case-sensitive.
-    case_sensitive_dtypes
-        Whether data types are case-sensitive.
-    full_match_dtypes
-        Whether data types must match exactly.
-
-    Returns
-    -------
-    bool
-        True if the schema matches, False otherwise.
-    """
-    validation_info = _get_schema_validation_info(
-        data_tbl=data_tbl,
-        schema=schema,
-        passed=False,  # This will be determined by the logic below
-        complete=complete,
-        in_order=in_order,
-        case_sensitive_colnames=case_sensitive_colnames,
-        case_sensitive_dtypes=case_sensitive_dtypes,
-        full_match_dtypes=full_match_dtypes,
-    )
-
-    # Determine if the schema validation passed based on the validation info
-    passed = True
-
-    # Check completeness requirement
-    if complete and not validation_info["columns_full_set"]:
-        passed = False
-
-    # Check order requirement
-    if in_order and not validation_info["columns_matched_in_order"]:
-        passed = False
-
-    # Check if all expected columns were found
-    if validation_info["columns_not_found"]:
-        passed = False
-
-    # Check column-specific validations
-    for col_info in validation_info["columns"].values():
-        if not col_info["colname_matched"]:
-            passed = False
-        if not col_info.get(
-            "dtype_matched", True
-        ):  # dtype_matched may not exist if no dtypes specified
-            passed = False
-
-    return passed
-
-
 def _get_schema_validation_info(
     data_tbl: any,
     schema: Schema,
pointblank/thresholds.py CHANGED
@@ -559,7 +559,7 @@ class FinalActions:
     def send_alert():
         summary = pb.get_validation_summary()
         if summary["highest_severity"] == "critical":
-            print(f"ALERT: Critical validation failures found in {summary['table_name']}")
+            print(f"ALERT: Critical validation failures found in {summary['tbl_name']}")

     validation = (
         pb.Validate(
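The corrected line sits in a docstring example for FinalActions: the summary dict returned by pb.get_validation_summary() exposes the table name under the "tbl_name" key, not "table_name", which is what this one-line fix addresses. A hedged completion of the truncated example is sketched below; the dataset and validation step are illustrative, and wiring the callback through final_actions=pb.FinalActions(...) is assumed from the class this hunk belongs to:

import pointblank as pb

def send_alert():
    summary = pb.get_validation_summary()
    if summary["highest_severity"] == "critical":
        print(f"ALERT: Critical validation failures found in {summary['tbl_name']}")

validation = (
    pb.Validate(
        data=pb.load_dataset(dataset="game_revenue"),
        final_actions=pb.FinalActions(send_alert),
    )
    .col_vals_gt(columns="item_revenue", value=0)
    .interrogate()
)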