pointblank 0.9.5__py3-none-any.whl → 0.10.0__py3-none-any.whl

pointblank/validate.py CHANGED
@@ -17,6 +17,7 @@ from zipfile import ZipFile
17
17
  import commonmark
18
18
  import narwhals as nw
19
19
  from great_tables import GT, from_column, google_font, html, loc, md, style, vals
20
+ from great_tables.gt import _get_column_of_values
20
21
  from great_tables.vals import fmt_integer, fmt_number
21
22
  from importlib_resources import files
22
23
  from narwhals.typing import FrameT
@@ -64,11 +65,15 @@ from pointblank._typing import SegmentSpec
64
65
  from pointblank._utils import (
65
66
  _check_any_df_lib,
66
67
  _check_invalid_fields,
68
+ _count_null_values_in_column,
69
+ _count_true_values_in_column,
67
70
  _derive_bounds,
68
71
  _format_to_integer_value,
69
72
  _get_fn_name,
70
73
  _get_tbl_type,
74
+ _is_lazy_frame,
71
75
  _is_lib_present,
76
+ _is_narwhals_table,
72
77
  _is_value_a_df,
73
78
  _select_df_lib,
74
79
  )
@@ -99,11 +104,13 @@ __all__ = [
99
104
  "Validate",
100
105
  "load_dataset",
101
106
  "config",
107
+ "connect_to_table",
102
108
  "preview",
103
109
  "missing_vals_tbl",
110
+ "get_action_metadata",
104
111
  "get_column_count",
112
+ "get_data_path",
105
113
  "get_row_count",
106
- "get_action_metadata",
107
114
  "get_validation_summary",
108
115
  ]
109
116
 
@@ -495,7 +502,9 @@ def load_dataset(
495
502
  raise ValueError(
496
503
  f"The dataset name `{dataset}` is not valid. Choose one of the following:\n"
497
504
  "- `small_table`\n"
498
- "- `game_revenue`"
505
+ "- `game_revenue`\n"
506
+ "- `nycflights`\n"
507
+ "- `global_sales`"
499
508
  )
500
509
 
501
510
  # Raise an error if the `tbl_type=` value is not of the supported types
@@ -560,6 +569,405 @@ def load_dataset(
560
569
  return dataset
561
570
 
562
571
 
572
+ def get_data_path(
573
+ dataset: Literal["small_table", "game_revenue", "nycflights", "global_sales"] = "small_table",
574
+ file_type: Literal["csv", "parquet", "duckdb"] = "csv",
575
+ ) -> str:
576
+ """
577
+ Get the file path to a dataset included with the Pointblank package.
578
+
579
+ This function provides direct access to the file paths of datasets included with Pointblank.
580
+ These paths can be used in examples and documentation to demonstrate file-based data loading
581
+ without requiring users to supply their own data files. The returned paths can be used with
582
+ `Validate(data=path)` to demonstrate CSV and Parquet file loading capabilities.
583
+
584
+ Parameters
585
+ ----------
586
+ dataset
587
+ The name of the dataset to get the path for. Current options are `"small_table"`,
588
+ `"game_revenue"`, `"nycflights"`, and `"global_sales"`.
589
+ file_type
590
+ The file format to get the path for. Options are `"csv"`, `"parquet"`, or `"duckdb"`.
591
+
592
+ Returns
593
+ -------
594
+ str
595
+ The file path to the requested dataset file.
596
+
597
+ Included Datasets
598
+ -----------------
599
+ The available datasets are the same as those in [`load_dataset()`](`pointblank.load_dataset`):
600
+
601
+ - `"small_table"`: A small dataset with 13 rows and 8 columns. Ideal for testing and examples.
602
+ - `"game_revenue"`: A dataset with 2000 rows and 11 columns. Revenue data for a game company.
603
+ - `"nycflights"`: A dataset with 336,776 rows and 18 columns. Flight data from NYC airports.
604
+ - `"global_sales"`: A dataset with 50,000 rows and 20 columns. Global sales data across regions.
605
+
606
+ File Types
607
+ ----------
608
+ Each dataset is available in multiple formats:
609
+
610
+ - `"csv"`: Comma-separated values file (`.csv`)
611
+ - `"parquet"`: Parquet file (`.parquet`)
612
+ - `"duckdb"`: DuckDB database file (`.ddb`)
613
+
614
+ Examples
615
+ --------
616
+ Get the path to a CSV file and use it with `Validate`:
617
+
618
+ ```{python}
619
+ import pointblank as pb
620
+
621
+ # Get path to the small_table CSV file
622
+ csv_path = pb.get_data_path("small_table", "csv")
623
+ print(csv_path)
624
+
625
+ # Use the path directly with Validate
626
+ validation = (
627
+ pb.Validate(data=csv_path)
628
+ .col_exists(["a", "b", "c"])
629
+ .col_vals_gt(columns="d", value=0)
630
+ .interrogate()
631
+ )
632
+
633
+ validation
634
+ ```
635
+
636
+ Get a Parquet file path for validation examples:
637
+
638
+ ```{python}
639
+ # Get path to the game_revenue Parquet file
640
+ parquet_path = pb.get_data_path(dataset="game_revenue", file_type="parquet")
641
+
642
+ # Validate the Parquet file directly
643
+ validation = (
644
+ pb.Validate(data=parquet_path, label="Game Revenue Data Validation")
645
+ .col_vals_not_null(columns=["player_id", "session_id"])
646
+ .col_vals_gt(columns="item_revenue", value=0)
647
+ .interrogate()
648
+ )
649
+
650
+ validation
651
+ ```
652
+
653
+ This is particularly useful for documentation examples where you want to demonstrate
654
+ file-based workflows without requiring users to have specific data files:
655
+
656
+ ```{python}
657
+ # Example showing CSV file validation
658
+ sales_csv = pb.get_data_path(dataset="global_sales", file_type="csv")
659
+
660
+ validation = (
661
+ pb.Validate(data=sales_csv, label="Sales Data Validation")
662
+ .col_exists(["customer_id", "product_id", "amount"])
663
+ .col_vals_regex(columns="customer_id", pattern=r"CUST_[0-9]{6}")
664
+ .interrogate()
665
+ )
666
+ ```
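+
+ The `"duckdb"` file type returns a path to a `.ddb` file, which can be combined with a
+ `duckdb:///` connection-string prefix and a `::table_name` suffix (see
+ [`connect_to_table()`](`pointblank.connect_to_table`)). A small sketch of that pattern:
+
+ ```python
+ # Get a path to the DuckDB database file and validate one of its tables
+ ddb_path = pb.get_data_path(dataset="game_revenue", file_type="duckdb")
+
+ validation = (
+     pb.Validate(data=f"duckdb:///{ddb_path}::game_revenue")
+     .col_vals_not_null(columns=["player_id", "session_id"])
+     .interrogate()
+ )
+ ```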
667
+
668
+ See Also
669
+ --------
670
+ [`load_dataset()`](`pointblank.load_dataset`) for loading datasets directly as table objects.
671
+ """
672
+
673
+ # Validate inputs
674
+ if dataset not in ["small_table", "game_revenue", "nycflights", "global_sales"]:
675
+ raise ValueError(
676
+ f"The dataset name `{dataset}` is not valid. Choose one of the following:\n"
677
+ "- `small_table`\n"
678
+ "- `game_revenue`\n"
679
+ "- `nycflights`\n"
680
+ "- `global_sales`"
681
+ )
682
+
683
+ if file_type not in ["csv", "parquet", "duckdb"]:
684
+ raise ValueError(
685
+ f"The file type `{file_type}` is not valid. Choose one of the following:\n"
686
+ "- `csv`\n"
687
+ "- `parquet`\n"
688
+ "- `duckdb`"
689
+ )
690
+
691
+ if file_type == "csv":
692
+ # Locate the zip archive in the package data that contains the CSV file
693
+ data_path = files("pointblank.data") / f"{dataset}.zip"
694
+
695
+ # For CSV files, we need to extract from zip to a temporary location
696
+ # since most libraries expect actual file paths, not zip contents
697
+ with tempfile.NamedTemporaryFile(mode="wb", suffix=".csv", delete=False) as tmp_file:
698
+ with ZipFile(data_path) as zip_file:
699
+ csv_content = zip_file.read(f"{dataset}.csv")
700
+ tmp_file.write(csv_content)
701
+ return tmp_file.name
702
+
703
+ elif file_type == "parquet":
704
+ # Create a temporary parquet file from the CSV data
705
+ data_path = files("pointblank.data") / f"{dataset}.zip"
706
+
707
+ # We'll need to convert CSV to Parquet temporarily
708
+ with tempfile.NamedTemporaryFile(mode="wb", suffix=".parquet", delete=False) as tmp_file:
709
+ # Load CSV data and save as Parquet
710
+ if _is_lib_present(lib_name="polars"):
711
+ import polars as pl
712
+
713
+ df = pl.read_csv(ZipFile(data_path).read(f"{dataset}.csv"), try_parse_dates=True)
714
+ df.write_parquet(tmp_file.name)
715
+ elif _is_lib_present(lib_name="pandas"):
716
+ import pandas as pd
717
+
718
+ df = pd.read_csv(data_path)
719
+ df.to_parquet(tmp_file.name, index=False)
720
+ else:
721
+ raise ImportError(
722
+ "Either Polars or Pandas is required to create temporary Parquet files."
723
+ )
724
+ return tmp_file.name
725
+
726
+ elif file_type == "duckdb":
727
+ # Locate the zip archive in the package data that contains the DuckDB file
728
+ data_path = files("pointblank.data") / f"{dataset}-duckdb.zip"
729
+
730
+ # Extract DuckDB file to temporary location
731
+ with tempfile.NamedTemporaryFile(mode="wb", suffix=".ddb", delete=False) as tmp_file:
732
+ with ZipFile(data_path) as zip_file:
733
+ ddb_content = zip_file.read(f"{dataset}.ddb")
734
+ tmp_file.write(ddb_content)
735
+ return tmp_file.name
736
+
737
+
738
+ # =============================================================================
739
+ # Utility functions for processing input data (shared by preview() and Validate class)
740
+ # =============================================================================
741
+
742
+
743
+ def _process_connection_string(data: FrameT | Any) -> FrameT | Any:
744
+ """
745
+ Process data parameter to handle database connection strings.
746
+
747
+ Uses the `connect_to_table()` utility function to handle URI-formatted connection strings with
748
+ table specifications. Returns the original data if it's not a connection string.
749
+
750
+ For more details on supported connection string formats, see the documentation
751
+ for `connect_to_table()`.
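+
+ A rough sketch of the routing (the connection string, table name, and `some_dataframe`
+ below are hypothetical):
+
+ ```python
+ _process_connection_string("duckdb:///analytics.ddb::sales")  # -> Ibis table via connect_to_table()
+ _process_connection_string("data/sales.csv")                  # -> returned unchanged (no "://")
+ _process_connection_string(some_dataframe)                    # -> returned unchanged (not a string)
+ ```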
752
+ """
753
+ # Check if data is a string that looks like a connection string
754
+ if not isinstance(data, str):
755
+ return data
756
+
757
+ # Basic connection string patterns
758
+ connection_patterns = [
759
+ "://", # General URL-like pattern
760
+ ]
761
+
762
+ # Check if it looks like a connection string
763
+ if not any(pattern in data for pattern in connection_patterns):
764
+ return data
765
+
766
+ # Use the utility function to connect to the table
767
+ return connect_to_table(data)
768
+
769
+
770
+ def _process_csv_input(data: FrameT | Any) -> FrameT | Any:
771
+ """
772
+ Process data parameter to handle CSV file inputs.
773
+
774
+ If `data` is a string or Path with a `.csv` extension, reads the CSV file
775
+ using available libraries (Polars preferred, then Pandas).
776
+
777
+ Returns the original data if it's not a CSV file path.
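+
+ Illustrative behavior (file names are hypothetical; the return type depends on whether
+ Polars or Pandas is installed):
+
+ ```python
+ _process_csv_input("metrics.csv")        # -> DataFrame read from the CSV file
+ _process_csv_input(Path("metrics.csv"))  # -> same; Path objects are accepted
+ _process_csv_input("metrics.parquet")    # -> returned unchanged (not a `.csv` path)
+ ```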
778
+ """
779
+ from pathlib import Path
780
+
781
+ # Check if data is a string or Path-like object with .csv extension
782
+ csv_path = None
783
+
784
+ if isinstance(data, (str, Path)):
785
+ path_obj = Path(data)
786
+ if path_obj.suffix.lower() == ".csv":
787
+ csv_path = path_obj
788
+
789
+ # If it's not a CSV file path, return the original data
790
+ if csv_path is None:
791
+ return data
792
+
793
+ # Check if the CSV file exists
794
+ if not csv_path.exists():
795
+ raise FileNotFoundError(f"CSV file not found: {csv_path}")
796
+
797
+ # Determine which library to use for reading CSV
798
+ # Prefer Polars, fallback to Pandas
799
+ if _is_lib_present(lib_name="polars"):
800
+ try:
801
+ import polars as pl
802
+
803
+ return pl.read_csv(csv_path, try_parse_dates=True)
804
+ except Exception as e:
805
+ # If Polars fails, try Pandas if available
806
+ if _is_lib_present(lib_name="pandas"):
807
+ import pandas as pd
808
+
809
+ return pd.read_csv(csv_path)
810
+ else:
811
+ raise RuntimeError(
812
+ f"Failed to read CSV file with Polars: {e}. "
813
+ "Pandas is not available as fallback."
814
+ ) from e
815
+ elif _is_lib_present(lib_name="pandas"):
816
+ try:
817
+ import pandas as pd
818
+
819
+ return pd.read_csv(csv_path)
820
+ except Exception as e:
821
+ raise RuntimeError(f"Failed to read CSV file with Pandas: {e}") from e
822
+ else:
823
+ raise ImportError(
824
+ "Neither Polars nor Pandas is available for reading CSV files. "
825
+ "Please install either 'polars' or 'pandas' to use CSV file inputs."
826
+ )
827
+
828
+
829
+ def _process_parquet_input(data: FrameT | Any) -> FrameT | Any:
830
+ """
831
+ Process data parameter to handle Parquet file inputs.
832
+
833
+ Supports:
834
+ - single .parquet file (string or Path)
835
+ - glob patterns for multiple .parquet files (e.g., "data/*.parquet")
836
+ - directory containing .parquet files
837
+ - partitioned Parquet datasets with automatic partition column inference
838
+ - list/sequence of .parquet file paths
839
+
840
+ Returns the original data if it's not a Parquet file input.
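+
+ Illustrative inputs (paths and `df` are hypothetical):
+
+ ```python
+ _process_parquet_input("sales.parquet")                  # single file
+ _process_parquet_input("data/sales_*.parquet")           # glob pattern
+ _process_parquet_input("partitioned_sales/")             # directory or partitioned dataset
+ _process_parquet_input(["jan.parquet", "feb.parquet"])   # list of files (concatenated)
+ _process_parquet_input(df)                               # non-Parquet input returned unchanged
+ ```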
841
+ """
842
+ import glob
843
+ from pathlib import Path
844
+
845
+ parquet_paths = []
846
+
847
+ # Handle different input types
848
+ if isinstance(data, (str, Path)):
849
+ data_str = str(data)
850
+ path_obj = Path(data)
851
+
852
+ # Check if it's a glob pattern containing .parquet first; look for glob
853
+ # characters: `*`, `?`, `[`, `]`
854
+ if ".parquet" in data_str.lower() and any(
855
+ char in data_str for char in ["*", "?", "[", "]"]
856
+ ):
857
+ parquet_files = glob.glob(data_str)
858
+ if parquet_files:
859
+ parquet_paths = sorted([Path(f) for f in parquet_files])
860
+ else:
861
+ raise FileNotFoundError(f"No files found matching pattern: {data}")
862
+
863
+ # Check if it's a single .parquet file
864
+ elif path_obj.suffix.lower() == ".parquet":
865
+ if path_obj.exists():
866
+ parquet_paths = [path_obj]
867
+ else:
868
+ raise FileNotFoundError(f"Parquet file not found: {path_obj}")
869
+
870
+ # Check if it's a directory
871
+ elif path_obj.is_dir():
872
+ # First, try to read as a partitioned Parquet dataset; this handles datasets where
873
+ # Parquet files are in subdirectories with partition columns encoded in paths
874
+ try:
875
+ # Both Polars and Pandas can handle partitioned datasets natively
876
+ if _is_lib_present(lib_name="polars"):
877
+ import polars as pl
878
+
879
+ # Try reading as partitioned dataset first
880
+ df = pl.read_parquet(str(path_obj))
881
+ return df
882
+ elif _is_lib_present(lib_name="pandas"):
883
+ import pandas as pd
884
+
885
+ # Try reading as partitioned dataset first
886
+ df = pd.read_parquet(str(path_obj))
887
+ return df
888
+ except Exception:
889
+ # If partitioned read fails, fall back to simple directory scan
890
+ pass
891
+
892
+ # Fallback: Look for .parquet files directly in the directory
893
+ parquet_files = list(path_obj.glob("*.parquet"))
894
+ if parquet_files:
895
+ parquet_paths = sorted(parquet_files)
896
+ else:
897
+ raise FileNotFoundError(
898
+ f"No .parquet files found in directory: {path_obj}. "
899
+ f"This could be a non-partitioned directory without .parquet files, "
900
+ f"or a partitioned dataset that couldn't be read."
901
+ )
902
+
903
+ # If it's not a parquet file, directory, or glob pattern, return original data
904
+ else:
905
+ return data
906
+
907
+ # Handle list/sequence of paths
908
+ elif isinstance(data, (list, tuple)):
909
+ for item in data:
910
+ item_path = Path(item)
911
+ if item_path.suffix.lower() == ".parquet":
912
+ if item_path.exists():
913
+ parquet_paths.append(item_path)
914
+ else:
915
+ raise FileNotFoundError(f"Parquet file not found: {item_path}")
916
+ else:
917
+ # If any item is not a parquet file, return original data
918
+ return data
919
+
920
+ # If no parquet files found, return original data
921
+ if not parquet_paths:
922
+ return data
923
+
924
+ # Read the parquet file(s) using available libraries; prefer Polars, fallback to Pandas
925
+ if _is_lib_present(lib_name="polars"):
926
+ try:
927
+ import polars as pl
928
+
929
+ if len(parquet_paths) == 1:
930
+ # Single file
931
+ return pl.read_parquet(parquet_paths[0])
932
+ else:
933
+ # Multiple files: concatenate them
934
+ dfs = [pl.read_parquet(path) for path in parquet_paths]
935
+ return pl.concat(dfs, how="vertical_relaxed")
936
+ except Exception as e:
937
+ # If Polars fails, try Pandas if available
938
+ if _is_lib_present(lib_name="pandas"):
939
+ import pandas as pd
940
+
941
+ if len(parquet_paths) == 1:
942
+ return pd.read_parquet(parquet_paths[0])
943
+ else:
944
+ # Multiple files: concatenate them
945
+ dfs = [pd.read_parquet(path) for path in parquet_paths]
946
+ return pd.concat(dfs, ignore_index=True)
947
+ else:
948
+ raise RuntimeError(
949
+ f"Failed to read Parquet file(s) with Polars: {e}. "
950
+ "Pandas is not available as fallback."
951
+ ) from e
952
+ elif _is_lib_present(lib_name="pandas"):
953
+ try:
954
+ import pandas as pd
955
+
956
+ if len(parquet_paths) == 1:
957
+ return pd.read_parquet(parquet_paths[0])
958
+ else:
959
+ # Multiple files: concatenate them
960
+ dfs = [pd.read_parquet(path) for path in parquet_paths]
961
+ return pd.concat(dfs, ignore_index=True)
962
+ except Exception as e:
963
+ raise RuntimeError(f"Failed to read Parquet file(s) with Pandas: {e}") from e
964
+ else:
965
+ raise ImportError(
966
+ "Neither Polars nor Pandas is available for reading Parquet files. "
967
+ "Please install either 'polars' or 'pandas' to use Parquet file inputs."
968
+ )
969
+
970
+
563
971
  def preview(
564
972
  data: FrameT | Any,
565
973
  columns_subset: str | list[str] | Column | None = None,
@@ -590,8 +998,14 @@ def preview(
590
998
  Parameters
591
999
  ----------
592
1000
  data
593
- The table to preview, which could be a DataFrame object or an Ibis table object. Read the
594
- *Supported Input Table Types* section for details on the supported table types.
1001
+ The table to preview, which could be a DataFrame object, an Ibis table object, a CSV
1002
+ file path, a Parquet file path, or a database connection string. When providing a CSV or
1003
+ Parquet file path (as a string or `pathlib.Path` object), the file will be automatically
1004
+ loaded using an available DataFrame library (Polars or Pandas). Parquet input also supports
1005
+ glob patterns, directories containing .parquet files, and Spark-style partitioned datasets.
1006
+ Connection strings enable direct database access via Ibis and require a table specification
1007
+ using the `::table_name` suffix. Read the *Supported Input Table Types* section for details
1008
+ on the supported table types.
595
1009
  columns_subset
596
1010
  The columns to display in the table, by default `None` (all columns are shown). This can
597
1011
  be a string, a list of strings, a `Column` object, or a `ColumnSelector` object. The latter
@@ -636,13 +1050,40 @@ def preview(
636
1050
  - MySQL table (`"mysql"`)*
637
1051
  - PostgreSQL table (`"postgresql"`)*
638
1052
  - SQLite table (`"sqlite"`)*
1053
+ - Microsoft SQL Server table (`"mssql"`)*
1054
+ - Snowflake table (`"snowflake"`)*
1055
+ - Databricks table (`"databricks"`)*
1056
+ - PySpark table (`"pyspark"`)*
1057
+ - BigQuery table (`"bigquery"`)*
639
1058
  - Parquet table (`"parquet"`)*
1059
+ - CSV files (string path or `pathlib.Path` object with `.csv` extension)
1060
+ - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
1061
+ extension, or partitioned dataset)
1062
+ - Database connection strings (URI format with a required `::table_name` specification)
640
1063
 
641
1064
  The table types marked with an asterisk need to be prepared as Ibis tables (with type of
642
1065
  `ibis.expr.types.relations.Table`). Furthermore, using `preview()` with these types of tables
643
1066
  requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a Polars or
644
1067
  Pandas DataFrame, the availability of Ibis is not needed.
645
1068
 
1069
+ To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
1070
+ provided. The file will be automatically detected and loaded using the best available DataFrame
1071
+ library. The loading preference is Polars first, then Pandas as a fallback.
1072
+
1073
+ Connection strings follow database URL formats and must also specify a table using the
1074
+ `::table_name` suffix. Examples include:
1075
+
1076
+ ```
1077
+ "duckdb:///path/to/database.ddb::table_name"
1078
+ "sqlite:///path/to/database.db::table_name"
1079
+ "postgresql://user:password@localhost:5432/database::table_name"
1080
+ "mysql://user:password@localhost:3306/database::table_name"
1081
+ "bigquery://project/dataset::table_name"
1082
+ "snowflake://user:password@account/database/schema::table_name"
1083
+ ```
1084
+
1085
+ When using connection strings, the Ibis library with the appropriate backend driver is required.
1086
+
646
1087
  Examples
647
1088
  --------
648
1089
  It's easy to preview a table using the `preview()` function. Here's an example using the
@@ -709,8 +1150,80 @@ def preview(
709
1150
  columns_subset=pb.col(pb.starts_with("item") | pb.matches("player"))
710
1151
  )
711
1152
  ```
1153
+
1154
+ ### Working with CSV Files
1155
+
1156
+ The `preview()` function can directly accept CSV file paths, making it easy to preview data
1157
+ stored in CSV files without manual loading:
1158
+
1159
+ ```{python}
1160
+ # Get a path to a CSV file from the package data
1161
+ csv_path = pb.get_data_path("global_sales", "csv")
1162
+
1163
+ pb.preview(csv_path)
1164
+ ```
1165
+
1166
+ You can also use a Path object to specify the CSV file:
1167
+
1168
+ ```{python}
1169
+ from pathlib import Path
1170
+
1171
+ csv_file = Path(pb.get_data_path("game_revenue", "csv"))
1172
+
1173
+ pb.preview(csv_file, n_head=3, n_tail=3)
1174
+ ```
1175
+
1176
+ ### Working with Parquet Files
1177
+
1178
+ The `preview()` function can directly accept Parquet files and datasets in various formats:
1179
+
1180
+ ```{python}
1181
+ # Single Parquet file from package data
1182
+ parquet_path = pb.get_data_path("nycflights", "parquet")
1183
+
1184
+ pb.preview(parquet_path)
1185
+ ```
1186
+
1187
+ You can also use glob patterns and directories:
1188
+
1189
+ ```python
1190
+ # Multiple Parquet files with glob patterns
1191
+ pb.preview("data/sales_*.parquet")
1192
+
1193
+ # Directory containing Parquet files
1194
+ pb.preview("parquet_data/")
1195
+
1196
+ # Partitioned Parquet dataset
1197
+ pb.preview("sales_data/") # Auto-discovers partition columns
1198
+ ```
1199
+
1200
+ ### Working with Database Connection Strings
1201
+
1202
+ The `preview()` function supports database connection strings for direct preview of database
1203
+ tables. Connection strings must specify a table using the `::table_name` suffix:
1204
+
1205
+ ```{python}
1206
+ # Get path to a DuckDB database file from package data
1207
+ duckdb_path = pb.get_data_path("game_revenue", "duckdb")
1208
+
1209
+ pb.preview(f"duckdb:///{duckdb_path}::game_revenue")
1210
+ ```
1211
+
1212
+ For comprehensive documentation on supported connection string formats, error handling, and
1213
+ installation requirements, see the [`connect_to_table()`](`pointblank.connect_to_table`)
1214
+ function.
712
1215
  """
713
1216
 
1217
+ # Process input data to handle different data source types
1218
+ # Handle connection string input (e.g., "duckdb:///path/to/file.ddb::table_name")
1219
+ data = _process_connection_string(data)
1220
+
1221
+ # Handle CSV file input (e.g., "data.csv" or Path("data.csv"))
1222
+ data = _process_csv_input(data)
1223
+
1224
+ # Handle Parquet file input (e.g., "data.parquet", "data/*.parquet", "data/")
1225
+ data = _process_parquet_input(data)
1226
+
714
1227
  if incl_header is None:
715
1228
  incl_header = global_config.preview_incl_header
716
1229
 
@@ -908,7 +1421,7 @@ def _generate_display_table(
908
1421
  k: v.split("(")[0] if "(" in v else v for k, v in col_dtype_dict.items()
909
1422
  }
910
1423
 
911
- # Create a dictionary of column and row positions where the value is None/NA/NULL
1424
+ # Create a dictionary of column and row positions where the value is None/NA/Null
912
1425
  # This is used to highlight these values in the table
913
1426
  if df_lib_name_gt == "polars":
914
1427
  none_values = {k: data[k].is_null().to_list() for k in col_names}
@@ -932,7 +1445,10 @@ def _generate_display_table(
932
1445
  column_values = gt.gt._get_column_of_values(built_gt, column_name=column, context="html")
933
1446
 
934
1447
  # Get the maximum number of characters in the column
935
- max_length_col_vals.append(max([len(str(val)) for val in column_values]))
1448
+ if column_values: # Check if column_values is not empty
1449
+ max_length_col_vals.append(max([len(str(val)) for val in column_values]))
1450
+ else:
1451
+ max_length_col_vals.append(0) # Use 0 for empty columns
936
1452
 
937
1453
  length_col_names = [len(column) for column in col_dtype_dict.keys()]
938
1454
  length_data_types = [len(dtype) for dtype in col_dtype_dict_short.values()]
@@ -1003,8 +1519,12 @@ def _generate_display_table(
1003
1519
 
1004
1520
  # Get the highest number in the `row_number_list` and calculate a width that will
1005
1521
  # safely fit a number of that magnitude
1006
- max_row_num = max(row_number_list)
1007
- max_row_num_width = len(str(max_row_num)) * 7.8 + 10
1522
+ if row_number_list: # Check if list is not empty
1523
+ max_row_num = max(row_number_list)
1524
+ max_row_num_width = len(str(max_row_num)) * 7.8 + 10
1525
+ else:
1526
+ # If row_number_list is empty, use a default width
1527
+ max_row_num_width = 7.8 * 2 + 10 # Width for 2-digit numbers
1008
1528
 
1009
1529
  # Update the col_width_dict to include the row number column
1010
1530
  col_width_dict = {"_row_num_": f"{max_row_num_width}px"} | col_width_dict
@@ -1134,6 +1654,11 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
1134
1654
  - MySQL table (`"mysql"`)*
1135
1655
  - PostgreSQL table (`"postgresql"`)*
1136
1656
  - SQLite table (`"sqlite"`)*
1657
+ - Microsoft SQL Server table (`"mssql"`)*
1658
+ - Snowflake table (`"snowflake"`)*
1659
+ - Databricks table (`"databricks"`)*
1660
+ - PySpark table (`"pyspark"`)*
1661
+ - BigQuery table (`"bigquery"`)*
1137
1662
  - Parquet table (`"parquet"`)*
1138
1663
 
1139
1664
  The table types marked with an asterisk need to be prepared as Ibis tables (with type of
@@ -1663,6 +2188,11 @@ def get_column_count(data: FrameT | Any) -> int:
1663
2188
  - MySQL table (`"mysql"`)*
1664
2189
  - PostgreSQL table (`"postgresql"`)*
1665
2190
  - SQLite table (`"sqlite"`)*
2191
+ - Microsoft SQL Server table (`"mssql"`)*
2192
+ - Snowflake table (`"snowflake"`)*
2193
+ - Databricks table (`"databricks"`)*
2194
+ - PySpark table (`"pyspark"`)*
2195
+ - BigQuery table (`"bigquery"`)*
1666
2196
  - Parquet table (`"parquet"`)*
1667
2197
 
1668
2198
  The table types marked with an asterisk need to be prepared as Ibis tables (with type of
@@ -1707,6 +2237,9 @@ def get_column_count(data: FrameT | Any) -> int:
1707
2237
  elif "pandas" in str(type(data)):
1708
2238
  return data.shape[1]
1709
2239
 
2240
+ elif "narwhals" in str(type(data)):
2241
+ return len(data.columns)
2242
+
1710
2243
  else:
1711
2244
  raise ValueError("The input table type supplied in `data=` is not supported.")
1712
2245
 
@@ -1741,6 +2274,11 @@ def get_row_count(data: FrameT | Any) -> int:
1741
2274
  - MySQL table (`"mysql"`)*
1742
2275
  - PostgreSQL table (`"postgresql"`)*
1743
2276
  - SQLite table (`"sqlite"`)*
2277
+ - Microsoft SQL Server table (`"mssql"`)*
2278
+ - Snowflake table (`"snowflake"`)*
2279
+ - Databricks table (`"databricks"`)*
2280
+ - PySpark table (`"pyspark"`)*
2281
+ - BigQuery table (`"bigquery"`)*
1744
2282
  - Parquet table (`"parquet"`)*
1745
2283
 
1746
2284
  The table types marked with an asterisk need to be prepared as Ibis tables (with type of
@@ -1795,6 +2333,9 @@ def get_row_count(data: FrameT | Any) -> int:
1795
2333
  elif "pandas" in str(type(data)):
1796
2334
  return data.shape[0]
1797
2335
 
2336
+ elif "narwhals" in str(type(data)):
2337
+ return data.shape[0]
2338
+
1798
2339
  else:
1799
2340
  raise ValueError("The input table type supplied in `data=` is not supported.")
1800
2341
 
@@ -1910,6 +2451,239 @@ class _ValidationInfo:
1910
2451
  return self.val_info
1911
2452
 
1912
2453
 
2454
+ def connect_to_table(connection_string: str) -> Any:
2455
+ """
2456
+ Connect to a database table using a connection string.
2457
+
2458
+ This utility function tests whether a connection string leads to a valid table and returns
2459
+ the table object if successful. It provides helpful error messages when no table is specified
2460
+ or when backend dependencies are missing.
2461
+
2462
+ Parameters
2463
+ ----------
2464
+ connection_string
2465
+ A database connection string with a required table specification using the `::table_name`
2466
+ suffix. Supported formats are outlined in the *Supported Connection String Formats* section.
2467
+
2468
+ Returns
2469
+ -------
2470
+ Any
2471
+ An Ibis table object for the specified database table.
2472
+
2473
+ Supported Connection String Formats
2474
+ -----------------------------------
2475
+ The `connection_string` parameter must include a valid connection string with a table name
2476
+ specified using the `::` syntax. Here are some examples on how to format connection strings
2477
+ for various backends:
2478
+
2479
+ ```
2480
+ DuckDB: "duckdb:///path/to/database.ddb::table_name"
2481
+ SQLite: "sqlite:///path/to/database.db::table_name"
2482
+ PostgreSQL: "postgresql://user:password@localhost:5432/database::table_name"
2483
+ MySQL: "mysql://user:password@localhost:3306/database::table_name"
2484
+ BigQuery: "bigquery://project/dataset::table_name"
2485
+ Snowflake: "snowflake://user:password@account/database/schema::table_name"
2486
+ ```
2487
+
2488
+ If the connection string does not include a table name, the function will attempt to connect to
2489
+ the database and list available tables, providing guidance on how to specify a table.
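+
+ For example, omitting the `::table_name` suffix raises a `ValueError` whose message lists the
+ tables that were found (the database path and table names below are illustrative):
+
+ ```python
+ pb.connect_to_table("duckdb:///analytics.ddb")
+ # ValueError: No table specified in connection string: duckdb:///analytics.ddb
+ #
+ # Available tables in the database:
+ #  - game_revenue
+ #  - sales
+ #
+ # To access a specific table, use the format:
+ #  duckdb:///analytics.ddb::TABLE_NAME
+ # ...
+ ```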
2490
+
2491
+ Examples
2492
+ --------
2493
+ Connect to a DuckDB table:
2494
+
2495
+ ```{python}
2496
+ import pointblank as pb
2497
+
2498
+ # Get path to a DuckDB database file from package data
2499
+ duckdb_path = pb.get_data_path("game_revenue", "duckdb")
2500
+
2501
+ # Connect to the `game_revenue` table in the DuckDB database
2502
+ game_revenue = pb.connect_to_table(f"duckdb:///{duckdb_path}::game_revenue")
2503
+
2504
+ # Use with the `preview()` function
2505
+ pb.preview(game_revenue)
2506
+ ```
2507
+
2508
+ Here are some backend-specific connection examples:
2509
+
2510
+ ```python
2511
+ # PostgreSQL
2512
+ pg_table = pb.connect_to_table(
2513
+ "postgresql://user:password@localhost:5432/warehouse::customer_data"
2514
+ )
2515
+
2516
+ # SQLite
2517
+ sqlite_table = pb.connect_to_table("sqlite:///local_data.db::products")
2518
+
2519
+ # BigQuery
2520
+ bq_table = pb.connect_to_table("bigquery://my-project/analytics::daily_metrics")
2521
+ ```
2522
+
2523
+ This function requires the Ibis library with appropriate backend drivers:
2524
+
2525
+ ```bash
2526
+ # You can install a set of common backends:
2527
+ pip install 'ibis-framework[duckdb,postgres,mysql,sqlite]'
2528
+
2529
+ # ...or specific backends as needed:
2530
+ pip install 'ibis-framework[duckdb]' # for DuckDB
2531
+ pip install 'ibis-framework[postgres]' # for PostgreSQL
2532
+ ```
2533
+ """
2534
+ # Check if Ibis is available
2535
+ if not _is_lib_present(lib_name="ibis"):
2536
+ raise ImportError(
2537
+ "The Ibis library is not installed but is required for database connection strings.\n"
2538
+ "Install it with: pip install 'ibis-framework[duckdb]' (or other backend as needed)"
2539
+ )
2540
+
2541
+ import ibis
2542
+
2543
+ # Check if connection string includes table specification
2544
+ if "::" not in connection_string:
2545
+ # Try to connect to get available tables for helpful error message
2546
+ try:
2547
+ # Extract the base connection string (without table name)
2548
+ base_connection = connection_string
2549
+
2550
+ # Connect to the database
2551
+ conn = ibis.connect(base_connection)
2552
+
2553
+ # Get list of available tables
2554
+ try:
2555
+ available_tables = conn.list_tables()
2556
+ except Exception:
2557
+ available_tables = []
2558
+
2559
+ conn.disconnect()
2560
+
2561
+ # Create helpful error message
2562
+ if available_tables:
2563
+ table_list = "\n".join(f" - {table}" for table in available_tables)
2564
+ error_msg = (
2565
+ f"No table specified in connection string: {connection_string}\n\n"
2566
+ f"Available tables in the database:\n{table_list}\n\n"
2567
+ f"To access a specific table, use the format:\n"
2568
+ f" {connection_string}::TABLE_NAME\n\n"
2569
+ f"Examples:\n"
2570
+ )
2571
+ # Add examples with first few table names
2572
+ for table in available_tables[:3]:
2573
+ error_msg += f" {connection_string}::{table}\n"
2574
+ else:
2575
+ error_msg = (
2576
+ f"No table specified in connection string: {connection_string}\n\n"
2577
+ f"No tables found in the database or unable to list tables.\n\n"
2578
+ f"To access a specific table, use the format:\n"
2579
+ f" {connection_string}::TABLE_NAME"
2580
+ )
2581
+
2582
+ raise ValueError(error_msg)
2583
+
2584
+ except Exception as e:
2585
+ if isinstance(e, ValueError):
2586
+ raise # Re-raise our custom ValueError
2587
+
2588
+ # Check for backend-specific errors and provide installation guidance
2589
+ error_str = str(e).lower()
2590
+ backend_install_map = {
2591
+ "duckdb": "pip install 'ibis-framework[duckdb]'",
2592
+ "postgresql": "pip install 'ibis-framework[postgres]'",
2593
+ "postgres": "pip install 'ibis-framework[postgres]'",
2594
+ "mysql": "pip install 'ibis-framework[mysql]'",
2595
+ "sqlite": "pip install 'ibis-framework[sqlite]'",
2596
+ "bigquery": "pip install 'ibis-framework[bigquery]'",
2597
+ "snowflake": "pip install 'ibis-framework[snowflake]'",
2598
+ }
2599
+
2600
+ # Check if this is a missing backend dependency
2601
+ for backend, install_cmd in backend_install_map.items():
2602
+ if backend in error_str and ("not found" in error_str or "no module" in error_str):
2603
+ raise ConnectionError(
2604
+ f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
2605
+ f" {install_cmd}\n\n"
2606
+ f"Original error: {e}\n\n"
2607
+ f"Supported connection string formats:\n"
2608
+ f"- DuckDB: 'duckdb:///path/to/file.ddb::table_name'\n"
2609
+ f"- SQLite: 'sqlite:///path/to/file.db::table_name'\n"
2610
+ f"- PostgreSQL: 'postgresql://user:pass@host:port/db::table_name'\n"
2611
+ f"- MySQL: 'mysql://user:pass@host:port/db::table_name'\n"
2612
+ f"- BigQuery: 'bigquery://project/dataset::table_name'\n"
2613
+ f"- Snowflake: 'snowflake://user:pass@account/db/schema::table_name'\n"
2614
+ f"\nNote: Use '::table_name' to specify the table within the database."
2615
+ ) from e
2616
+
2617
+ # Generic connection error
2618
+ raise ConnectionError(
2619
+ f"Failed to connect to database using connection string: {connection_string}\n"
2620
+ f"Error: {e}\n\n"
2621
+ f"No table specified. Use the format: {connection_string}::TABLE_NAME"
2622
+ ) from e
2623
+
2624
+ # Split connection string and table name
2625
+ try:
2626
+ base_connection, table_name = connection_string.rsplit("::", 1)
2627
+ except ValueError:
2628
+ raise ValueError(f"Invalid connection string format: {connection_string}")
2629
+
2630
+ # Connect to database and get table
2631
+ try:
2632
+ conn = ibis.connect(base_connection)
2633
+ table = conn.table(table_name)
2634
+ return table
2635
+
2636
+ except Exception as e:
2637
+ # Check for backend-specific errors and provide installation guidance
2638
+ error_str = str(e).lower()
2639
+ backend_install_map = {
2640
+ "duckdb": "pip install 'ibis-framework[duckdb]'",
2641
+ "postgresql": "pip install 'ibis-framework[postgres]'",
2642
+ "postgres": "pip install 'ibis-framework[postgres]'",
2643
+ "mysql": "pip install 'ibis-framework[mysql]'",
2644
+ "sqlite": "pip install 'ibis-framework[sqlite]'",
2645
+ "bigquery": "pip install 'ibis-framework[bigquery]'",
2646
+ "snowflake": "pip install 'ibis-framework[snowflake]'",
2647
+ }
2648
+
2649
+ # Check if this is a missing backend dependency
2650
+ for backend, install_cmd in backend_install_map.items():
2651
+ if backend in error_str and ("not found" in error_str or "no module" in error_str):
2652
+ raise ConnectionError(
2653
+ f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
2654
+ f" {install_cmd}\n\n"
2655
+ f"Original error: {e}"
2656
+ ) from e
2657
+
2658
+ # Check if table doesn't exist
2659
+ if "table" in error_str and ("not found" in error_str or "does not exist" in error_str):
2660
+ # Try to get available tables for helpful message
2661
+ try:
+ available_tables = conn.list_tables()
+ except Exception:
+ available_tables = []
+ if available_tables:
+ table_list = "\n".join(f" - {table}" for table in available_tables)
+ raise ValueError(
+ f"Table '{table_name}' not found in database.\n\n"
+ f"Available tables:\n{table_list}\n\n"
+ f"Check the table name and try again with:\n"
+ f" {base_connection}::CORRECT_TABLE_NAME"
+ ) from e
+ else:
+ raise ValueError(
+ f"Table '{table_name}' not found and no tables available in database."
+ ) from e
2680
+
2681
+ # Generic connection error
2682
+ raise ConnectionError(
2683
+ f"Failed to connect to table '{table_name}' using: {base_connection}\nError: {e}"
2684
+ ) from e
2685
+
2686
+
1913
2687
  @dataclass
1914
2688
  class Validate:
1915
2689
  """
@@ -1942,8 +2716,14 @@ class Validate:
1942
2716
  Parameters
1943
2717
  ----------
1944
2718
  data
1945
- The table to validate, which could be a DataFrame object or an Ibis table object. Read the
1946
- *Supported Input Table Types* section for details on the supported table types.
2719
+ The table to validate, which could be a DataFrame object, an Ibis table object, a CSV
2720
+ file path, a Parquet file path, or a database connection string. When providing a CSV or
2721
+ Parquet file path (as a string or `pathlib.Path` object), the file will be automatically
2722
+ loaded using an available DataFrame library (Polars or Pandas). Parquet input also supports
2723
+ glob patterns, directories containing .parquet files, and Spark-style partitioned datasets.
2724
+ Connection strings enable direct database access via Ibis and require a table specification
2725
+ using the `::table_name` suffix. Read the *Supported Input Table Types* section for details
2726
+ on the supported table types.
1947
2727
  tbl_name
1948
2728
  An optional name to assign to the input table object. If no value is provided, a name will
1949
2729
  be generated based on whatever information is available. This table name will be displayed
@@ -2007,13 +2787,40 @@ class Validate:
2007
2787
  - MySQL table (`"mysql"`)*
2008
2788
  - PostgreSQL table (`"postgresql"`)*
2009
2789
  - SQLite table (`"sqlite"`)*
2790
+ - Microsoft SQL Server table (`"mssql"`)*
2791
+ - Snowflake table (`"snowflake"`)*
2792
+ - Databricks table (`"databricks"`)*
2793
+ - PySpark table (`"pyspark"`)*
2794
+ - BigQuery table (`"bigquery"`)*
2010
2795
  - Parquet table (`"parquet"`)*
2796
+ - CSV files (string path or `pathlib.Path` object with `.csv` extension)
2797
+ - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
2798
+ extension, or partitioned dataset)
2799
+ - Database connection strings (URI format with a required `::table_name` specification)
2011
2800
 
2012
2801
  The table types marked with an asterisk need to be prepared as Ibis tables (with type of
2013
2802
  `ibis.expr.types.relations.Table`). Furthermore, the use of `Validate` with such tables requires
2014
2803
  the Ibis library v9.5.0 and above to be installed. If the input table is a Polars or Pandas
2015
2804
  DataFrame, the Ibis library is not required.
2016
2805
 
2806
+ To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
2807
+ provided. The file will be automatically detected and loaded using the best available DataFrame
2808
+ library. The loading preference is Polars first, then Pandas as a fallback.
2809
+
2810
+ Connection strings follow database URL formats and must also specify a table using the
2811
+ `::table_name` suffix. Examples include:
2812
+
2813
+ ```
2814
+ "duckdb:///path/to/database.ddb::table_name"
2815
+ "sqlite:///path/to/database.db::table_name"
2816
+ "postgresql://user:password@localhost:5432/database::table_name"
2817
+ "mysql://user:password@localhost:3306/database::table_name"
2818
+ "bigquery://project/dataset::table_name"
2819
+ "snowflake://user:password@account/database/schema::table_name"
2820
+ ```
2821
+
2822
+ When using connection strings, the Ibis library with the appropriate backend driver is required.
2823
+
2017
2824
  Thresholds
2018
2825
  ----------
2019
2826
  The `thresholds=` parameter is used to set the failure-condition levels for all validation
@@ -2170,8 +2977,8 @@ class Validate:
2170
2977
  ```{python}
2171
2978
  import pointblank as pb
2172
2979
 
2173
- # Load the small_table dataset
2174
- small_table = pb.load_dataset()
2980
+ # Load the `small_table` dataset
2981
+ small_table = pb.load_dataset(dataset="small_table", tbl_type="polars")
2175
2982
 
2176
2983
  # Preview the table
2177
2984
  pb.preview(small_table)
@@ -2237,7 +3044,7 @@ class Validate:
2237
3044
  brief). Here's an example of a global setting for briefs:
2238
3045
 
2239
3046
  ```{python}
2240
- validation = (
3047
+ validation_2 = (
2241
3048
  pb.Validate(
2242
3049
  data=pb.load_dataset(),
2243
3050
  tbl_name="small_table",
@@ -2254,7 +3061,7 @@ class Validate:
2254
3061
  .interrogate()
2255
3062
  )
2256
3063
 
2257
- validation
3064
+ validation_2
2258
3065
  ```
2259
3066
 
2260
3067
  We see the text of the briefs appear in the `STEP` column of the reporting table. Furthermore,
@@ -2272,7 +3079,7 @@ class Validate:
2272
3079
  the data extracts for each validation step.
2273
3080
 
2274
3081
  ```{python}
2275
- validation.get_data_extracts()
3082
+ validation_2.get_data_extracts()
2276
3083
  ```
2277
3084
 
2278
3085
  We can also view step reports for each validation step using the
@@ -2280,7 +3087,7 @@ class Validate:
2280
3087
  type of validation step and shows the relevant information for a step's validation.
2281
3088
 
2282
3089
  ```{python}
2283
- validation.get_step_report(i=2)
3090
+ validation_2.get_step_report(i=2)
2284
3091
  ```
2285
3092
 
2286
3093
  The `Validate` class also has a method for getting the sundered data, which is the data that
@@ -2288,11 +3095,141 @@ class Validate:
2288
3095
  [`get_sundered_data()`](`pointblank.Validate.get_sundered_data`) method.
2289
3096
 
2290
3097
  ```{python}
2291
- pb.preview(validation.get_sundered_data())
3098
+ pb.preview(validation_2.get_sundered_data())
2292
3099
  ```
2293
3100
 
2294
3101
  The sundered data is a DataFrame that contains the rows that passed or failed the validation.
2295
3102
  The default behavior is to return the rows that failed the validation, as shown above.
3103
+
3104
+ ### Working with CSV Files
3105
+
3106
+ The `Validate` class can directly accept CSV file paths, making it easy to validate data stored
3107
+ in CSV files without manual loading:
3108
+
3109
+ ```{python}
3110
+ # Get a path to a CSV file from the package data
3111
+ csv_path = pb.get_data_path("global_sales", "csv")
3112
+
3113
+ validation_3 = (
3114
+ pb.Validate(
3115
+ data=csv_path,
3116
+ label="CSV validation example"
3117
+ )
3118
+ .col_exists(["customer_id", "product_id", "revenue"])
3119
+ .col_vals_not_null(["customer_id", "product_id"])
3120
+ .col_vals_gt(columns="revenue", value=0)
3121
+ .interrogate()
3122
+ )
3123
+
3124
+ validation_3
3125
+ ```
3126
+
3127
+ You can also use a Path object to specify the CSV file. Here's an example of how to do that:
3128
+
3129
+ ```{python}
3130
+ from pathlib import Path
3131
+
3132
+ csv_file = Path(pb.get_data_path("game_revenue", "csv"))
3133
+
3134
+ validation_4 = (
3135
+ pb.Validate(data=csv_file, label="Game Revenue Validation")
3136
+ .col_exists(["player_id", "session_id", "item_name"])
3137
+ .col_vals_regex(
3138
+ columns="session_id",
3139
+ pattern=r"[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}"
3140
+ )
3141
+ .col_vals_gt(columns="item_revenue", value=0, na_pass=True)
3142
+ .interrogate()
3143
+ )
3144
+
3145
+ validation_4
3146
+ ```
3147
+
3148
+ The CSV loading is automatic, so when a string or Path with a `.csv` extension is provided,
3149
+ Pointblank will automatically load the file using the best available DataFrame library (Polars
3150
+ preferred, Pandas as fallback). The loaded data can then be used with all validation methods
3151
+ just like any other supported table type.
3152
+
3153
+ ### Working with Parquet Files
3154
+
3155
+ The `Validate` class can directly accept Parquet files and datasets in various formats. The
3156
+ following examples illustrate how to validate Parquet files:
3157
+
3158
+ ```{python}
3159
+ # Single Parquet file from package data
3160
+ parquet_path = pb.get_data_path("nycflights", "parquet")
3161
+
3162
+ validation_5 = (
3163
+ pb.Validate(
3164
+ data=parquet_path,
3165
+ tbl_name="NYC Flights Data"
3166
+ )
3167
+ .col_vals_not_null(["carrier", "origin", "dest"])
3168
+ .col_vals_gt(columns="distance", value=0)
3169
+ .interrogate()
3170
+ )
3171
+
3172
+ validation_5
3173
+ ```
3174
+
3175
+ You can also use glob patterns and directories. Here are some examples for how to:
3176
+
3177
+ 1. load multiple Parquet files
3178
+ 2. load a Parquet-containing directory
3179
+ 3. load a partitioned Parquet dataset
3180
+
3181
+ ```python
3182
+ # Multiple Parquet files with glob patterns
3183
+ validation_6 = pb.Validate(data="data/sales_*.parquet")
3184
+
3185
+ # Directory containing Parquet files
3186
+ validation_7 = pb.Validate(data="parquet_data/")
3187
+
3188
+ # Partitioned Parquet dataset
3189
+ validation_8 = (
3190
+ pb.Validate(data="sales_data/") # Contains year=2023/quarter=Q1/region=US/sales.parquet
3191
+ .col_exists(["transaction_id", "amount", "year", "quarter", "region"])
3192
+ .interrogate()
3193
+ )
3194
+ ```
3195
+
3196
+ When you point to a directory that contains a partitioned Parquet dataset (with subdirectories
3197
+ like `year=2023/quarter=Q1/region=US/`), Pointblank will automatically:
3198
+
3199
+ - discover all Parquet files recursively
3200
+ - extract partition column values from directory paths
3201
+ - add partition columns to the final DataFrame
3202
+ - combine all partitions into a single table for validation
3203
+
3204
+ Both Polars and Pandas handle partitioned datasets natively, so this works seamlessly with
3205
+ either DataFrame library. The loading preference is Polars first, then Pandas as a fallback.
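+
+ As a minimal sketch (hypothetical paths and columns, and assuming `pyarrow` is installed for
+ the partitioned write), a dataset written with Pandas' `partition_cols=` can be validated by
+ pointing `Validate` at its top-level directory:
+
+ ```python
+ import pandas as pd
+
+ # Writes sales_data/year=2023/... and sales_data/year=2024/...
+ df = pd.DataFrame(
+     {"transaction_id": ["T-1", "T-2"], "amount": [100.0, 250.0], "year": [2023, 2024]}
+ )
+ df.to_parquet("sales_data/", partition_cols=["year"])
+
+ # The `year` partition column is recovered from the directory names during loading
+ validation_partitioned = (
+     pb.Validate(data="sales_data/")
+     .col_exists(["transaction_id", "amount", "year"])
+     .col_vals_gt(columns="amount", value=0)
+     .interrogate()
+ )
+ ```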
3206
+
3207
+ ### Working with Database Connection Strings
3208
+
3209
+ The `Validate` class supports database connection strings for direct validation of database
3210
+ tables. Connection strings must specify a table using the `::table_name` suffix:
3211
+
3212
+ ```{python}
3213
+ # Get path to a DuckDB database file from package data
3214
+ duckdb_path = pb.get_data_path("game_revenue", "duckdb")
3215
+
3216
+ validation_9 = (
3217
+ pb.Validate(
3218
+ data=f"duckdb:///{duckdb_path}::game_revenue",
3219
+ label="DuckDB Game Revenue Validation"
3220
+ )
3221
+ .col_exists(["player_id", "session_id", "item_revenue"])
3222
+ .col_vals_gt(columns="item_revenue", value=0)
3223
+ .interrogate()
3224
+ )
3225
+
3226
+ validation_9
3227
+ ```
3228
+
3229
+ For comprehensive documentation on supported connection string formats, error handling, and
3230
+ installation requirements, see the [`connect_to_table()`](`pointblank.connect_to_table`)
3231
+ function. This function handles all the connection logic and provides helpful error messages
3232
+ when table specifications are missing or backend dependencies are not installed.
2296
3233
  """
2297
3234
 
2298
3235
  data: FrameT | Any
@@ -2306,6 +3243,15 @@ class Validate:
2306
3243
  locale: str | None = None
2307
3244
 
2308
3245
  def __post_init__(self):
3246
+ # Handle connection string input for the data parameter
3247
+ self.data = _process_connection_string(self.data)
3248
+
3249
+ # Handle CSV file input for the data parameter
3250
+ self.data = _process_csv_input(self.data)
3251
+
3252
+ # Handle Parquet file input for the data parameter
3253
+ self.data = _process_parquet_input(self.data)
3254
+
2309
3255
  # Check input of the `thresholds=` argument
2310
3256
  _check_thresholds(thresholds=self.thresholds)
2311
3257
 
@@ -2481,12 +3427,16 @@ class Validate:
2481
3427
  (i.e., no validation steps will be created for them).
2482
3428
 
2483
3429
  A list with a combination of column names and tuples can be provided as well. This allows
2484
- for more complex segmentation scenarios. The following inputs are all valid:
3430
+ for more complex segmentation scenarios. The following inputs are both valid:
2485
3431
 
2486
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
2487
- in the `"region"` column and specific dates in the `"date"` column
2488
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
2489
- columns
3432
+ ```
3433
+ # Segments from all unique values in the `region` column
3434
+ # and specific dates in the `date` column
3435
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
3436
+
3437
+ # Segments from all unique values in the `region` and `date` columns
3438
+ segments=["region", "date"]
3439
+ ```
2490
3440
 
2491
3441
  The segmentation is performed during interrogation, and the resulting validation steps will
2492
3442
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -2769,12 +3719,16 @@ class Validate:
2769
3719
  (i.e., no validation steps will be created for them).
2770
3720
 
2771
3721
  A list with a combination of column names and tuples can be provided as well. This allows
2772
- for more complex segmentation scenarios. The following inputs are all valid:
3722
+ for more complex segmentation scenarios. The following inputs are both valid:
2773
3723
 
2774
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
2775
- in the `"region"` column and specific dates in the `"date"` column
2776
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
2777
- columns
3724
+ ```
3725
+ # Segments from all unique values in the `region` column
3726
+ # and specific dates in the `date` column
3727
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
3728
+
3729
+ # Segments from all unique values in the `region` and `date` columns
3730
+ segments=["region", "date"]
3731
+ ```
2778
3732
 
2779
3733
  The segmentation is performed during interrogation, and the resulting validation steps will
2780
3734
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3056,12 +4010,16 @@ class Validate:
3056
4010
  (i.e., no validation steps will be created for them).
3057
4011
 
3058
4012
  A list with a combination of column names and tuples can be provided as well. This allows
3059
- for more complex segmentation scenarios. The following inputs are all valid:
4013
+ for more complex segmentation scenarios. The following inputs are both valid:
3060
4014
 
3061
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
3062
- in the `"region"` column and specific dates in the `"date"` column
3063
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
3064
- columns
4015
+ ```
4016
+ # Segments from all unique values in the `region` column
4017
+ # and specific dates in the `date` column
4018
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
4019
+
4020
+ # Segments from all unique values in the `region` and `date` columns
4021
+ segments=["region", "date"]
4022
+ ```
3065
4023
 
3066
4024
  The segmentation is performed during interrogation, and the resulting validation steps will
3067
4025
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3342,12 +4300,16 @@ class Validate:
3342
4300
  (i.e., no validation steps will be created for them).
3343
4301
 
3344
4302
  A list with a combination of column names and tuples can be provided as well. This allows
3345
- for more complex segmentation scenarios. The following inputs are all valid:
4303
+ for more complex segmentation scenarios. The following inputs are both valid:
3346
4304
 
3347
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
3348
- in the `"region"` column and specific dates in the `"date"` column
3349
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
3350
- columns
4305
+ ```
4306
+ # Segments from all unique values in the `region` column
4307
+ # and specific dates in the `date` column
4308
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
4309
+
4310
+ # Segments from all unique values in the `region` and `date` columns
4311
+ segments=["region", "date"]
4312
+ ```
3351
4313
 
3352
4314
  The segmentation is performed during interrogation, and the resulting validation steps will
3353
4315
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3626,12 +4588,16 @@ class Validate:
3626
4588
  (i.e., no validation steps will be created for them).
3627
4589
 
3628
4590
  A list with a combination of column names and tuples can be provided as well. This allows
3629
- for more complex segmentation scenarios. The following inputs are all valid:
4591
+ for more complex segmentation scenarios. The following inputs are both valid:
3630
4592
 
3631
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
3632
- in the `"region"` column and specific dates in the `"date"` column
3633
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
3634
- columns
4593
+ ```
4594
+ # Segments from all unique values in the `region` column
4595
+ # and specific dates in the `date` column
4596
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
4597
+
4598
+ # Segments from all unique values in the `region` and `date` columns
4599
+ segments=["region", "date"]
4600
+ ```
3635
4601
 
3636
4602
  The segmentation is performed during interrogation, and the resulting validation steps will
3637
4603
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3914,12 +4880,16 @@ class Validate:
3914
4880
  (i.e., no validation steps will be created for them).
3915
4881
 
3916
4882
  A list with a combination of column names and tuples can be provided as well. This allows
3917
- for more complex segmentation scenarios. The following inputs are all valid:
4883
+ for more complex segmentation scenarios. The following inputs are both valid:
3918
4884
 
3919
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
3920
- in the `"region"` column and specific dates in the `"date"` column
3921
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
3922
- columns
4885
+ ```
4886
+ # Segments from all unique values in the `region` column
4887
+ # and specific dates in the `date` column
4888
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
4889
+
4890
+ # Segments from all unique values in the `region` and `date` columns
4891
+ segments=["region", "date"]
4892
+ ```
3923
4893
 
3924
4894
  The segmentation is performed during interrogation, and the resulting validation steps will
3925
4895
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -4216,12 +5186,16 @@ class Validate:
4216
5186
  (i.e., no validation steps will be created for them).
4217
5187
 
4218
5188
  A list with a combination of column names and tuples can be provided as well. This allows
4219
- for more complex segmentation scenarios. The following inputs are all valid:
5189
+ for more complex segmentation scenarios. The following inputs are both valid:
4220
5190
 
4221
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
4222
- in the `"region"` column and specific dates in the `"date"` column
4223
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
4224
- columns
5191
+ ```
5192
+ # Segments from all unique values in the `region` column
5193
+ # and specific dates in the `date` column
5194
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
5195
+
5196
+ # Segments from all unique values in the `region` and `date` columns
5197
+ segments=["region", "date"]
5198
+ ```
4225
5199
 
4226
5200
  The segmentation is performed during interrogation, and the resulting validation steps will
4227
5201
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -4532,12 +5506,16 @@ class Validate:
4532
5506
  (i.e., no validation steps will be created for them).
4533
5507
 
4534
5508
  A list with a combination of column names and tuples can be provided as well. This allows
4535
- for more complex segmentation scenarios. The following inputs are all valid:
5509
+ for more complex segmentation scenarios. The following inputs are both valid:
4536
5510
 
4537
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
4538
- in the `"region"` column and specific dates in the `"date"` column
4539
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
4540
- columns
5511
+ ```
5512
+ # Segments from all unique values in the `region` column
5513
+ # and specific dates in the `date` column
5514
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
5515
+
5516
+ # Segments from all unique values in the `region` and `date` columns
5517
+ segments=["region", "date"]
5518
+ ```
4541
5519
 
4542
5520
  The segmentation is performed during interrogation, and the resulting validation steps will
4543
5521
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -4804,12 +5782,16 @@ class Validate:
4804
5782
  (i.e., no validation steps will be created for them).
4805
5783
 
4806
5784
  A list with a combination of column names and tuples can be provided as well. This allows
4807
- for more complex segmentation scenarios. The following inputs are all valid:
5785
+ for more complex segmentation scenarios. The following inputs are both valid:
4808
5786
 
4809
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
4810
- in the `"region"` column and specific dates in the `"date"` column
4811
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
4812
- columns
5787
+ ```
5788
+ # Segments from all unique values in the `region` column
5789
+ # and specific dates in the `date` column
5790
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
5791
+
5792
+ # Segments from all unique values in the `region` and `date` columns
5793
+ segments=["region", "date"]
5794
+ ```
4813
5795
 
4814
5796
  The segmentation is performed during interrogation, and the resulting validation steps will
4815
5797
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -5057,12 +6039,16 @@ class Validate:
5057
6039
  (i.e., no validation steps will be created for them).
5058
6040
 
5059
6041
  A list with a combination of column names and tuples can be provided as well. This allows
5060
- for more complex segmentation scenarios. The following inputs are all valid:
6042
+ for more complex segmentation scenarios. The following inputs are both valid:
5061
6043
 
5062
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
5063
- in the `"region"` column and specific dates in the `"date"` column
5064
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
5065
- columns
6044
+ ```
6045
+ # Segments from all unique values in the `region` column
6046
+ # and specific dates in the `date` column
6047
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
6048
+
6049
+ # Segments from all unique values in the `region` and `date` columns
6050
+ segments=["region", "date"]
6051
+ ```
5066
6052
 
5067
6053
  The segmentation is performed during interrogation, and the resulting validation steps will
5068
6054
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -5218,9 +6204,9 @@ class Validate:
5218
6204
  active: bool = True,
5219
6205
  ) -> Validate:
5220
6206
  """
5221
- Validate whether values in a column are NULL.
6207
+ Validate whether values in a column are Null.
5222
6208
 
5223
- The `col_vals_null()` validation method checks whether column values in a table are NULL.
6209
+ The `col_vals_null()` validation method checks whether column values in a table are Null.
5224
6210
  This validation will operate over the number of test units that is equal to the number
5225
6211
  of rows in the table.
5226
6212
 
@@ -5301,12 +6287,16 @@ class Validate:
5301
6287
  (i.e., no validation steps will be created for them).
5302
6288
 
5303
6289
  A list with a combination of column names and tuples can be provided as well. This allows
5304
- for more complex segmentation scenarios. The following inputs are all valid:
6290
+ for more complex segmentation scenarios. The following inputs are both valid:
5305
6291
 
5306
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
5307
- in the `"region"` column and specific dates in the `"date"` column
5308
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
5309
- columns
6292
+ ```
6293
+ # Segments from all unique values in the `region` column
6294
+ # and specific dates in the `date` column
6295
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
6296
+
6297
+ # Segments from all unique values in the `region` and `date` columns
6298
+ segments=["region", "date"]
6299
+ ```
5310
6300
 
5311
6301
  The segmentation is performed during interrogation, and the resulting validation steps will
5312
6302
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -5457,10 +6447,10 @@ class Validate:
5457
6447
  active: bool = True,
5458
6448
  ) -> Validate:
5459
6449
  """
5460
- Validate whether values in a column are not NULL.
6450
+ Validate whether values in a column are not Null.
5461
6451
 
5462
6452
  The `col_vals_not_null()` validation method checks whether column values in a table are not
5463
- NULL. This validation will operate over the number of test units that is equal to the number
6453
+ Null. This validation will operate over the number of test units that is equal to the number
5464
6454
  of rows in the table.
5465
6455
 
5466
6456
  Parameters
@@ -5540,12 +6530,16 @@ class Validate:
5540
6530
  (i.e., no validation steps will be created for them).
5541
6531
 
5542
6532
  A list with a combination of column names and tuples can be provided as well. This allows
5543
- for more complex segmentation scenarios. The following inputs are all valid:
6533
+ for more complex segmentation scenarios. The following inputs are both valid:
5544
6534
 
5545
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
5546
- in the `"region"` column and specific dates in the `"date"` column
5547
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
5548
- columns
6535
+ ```
6536
+ # Segments from all unique values in the `region` column
6537
+ # and specific dates in the `date` column
6538
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
6539
+
6540
+ # Segments from all unique values in the `region` and `date` columns
6541
+ segments=["region", "date"]
6542
+ ```
5549
6543
 
5550
6544
  The segmentation is performed during interrogation, and the resulting validation steps will
5551
6545
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -5787,12 +6781,16 @@ class Validate:
5787
6781
  (i.e., no validation steps will be created for them).
5788
6782
 
5789
6783
  A list with a combination of column names and tuples can be provided as well. This allows
5790
- for more complex segmentation scenarios. The following inputs are all valid:
6784
+ for more complex segmentation scenarios. The following inputs are both valid:
5791
6785
 
5792
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
5793
- in the `"region"` column and specific dates in the `"date"` column
5794
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
5795
- columns
6786
+ ```
6787
+ # Segments from all unique values in the `region` column
6788
+ # and specific dates in the `date` column
6789
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
6790
+
6791
+ # Segments from all unique values in the `region` and `date` columns
6792
+ segments=["region", "date"]
6793
+ ```
5796
6794
 
5797
6795
  The segmentation is performed during interrogation, and the resulting validation steps will
5798
6796
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -6030,12 +7028,16 @@ class Validate:
6030
7028
  (i.e., no validation steps will be created for them).
6031
7029
 
6032
7030
  A list with a combination of column names and tuples can be provided as well. This allows
6033
- for more complex segmentation scenarios. The following inputs are all valid:
7031
+ for more complex segmentation scenarios. The following inputs are both valid:
6034
7032
 
6035
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
6036
- in the `"region"` column and specific dates in the `"date"` column
6037
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
6038
- columns
7033
+ ```
7034
+ # Segments from all unique values in the `region` column
7035
+ # and specific dates in the `date` column
7036
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
7037
+
7038
+ # Segments from all unique values in the `region` and `date` columns
7039
+ segments=["region", "date"]
7040
+ ```
6039
7041
 
6040
7042
  The segmentation is performed during interrogation, and the resulting validation steps will
6041
7043
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -6421,12 +7423,16 @@ class Validate:
6421
7423
  (i.e., no validation steps will be created for them).
6422
7424
 
6423
7425
  A list with a combination of column names and tuples can be provided as well. This allows
6424
- for more complex segmentation scenarios. The following inputs are all valid:
7426
+ for more complex segmentation scenarios. The following inputs are both valid:
6425
7427
 
6426
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
6427
- in the `"region"` column and specific dates in the `"date"` column
6428
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
6429
- columns
7428
+ ```
7429
+ # Segments from all unique values in the `region` column
7430
+ # and specific dates in the `date` column
7431
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
7432
+
7433
+ # Segments from all unique values in the `region` and `date` columns
7434
+ segments=["region", "date"]
7435
+ ```
6430
7436
 
6431
7437
  The segmentation is performed during interrogation, and the resulting validation steps will
6432
7438
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -6658,12 +7664,16 @@ class Validate:
6658
7664
  (i.e., no validation steps will be created for them).
6659
7665
 
6660
7666
  A list with a combination of column names and tuples can be provided as well. This allows
6661
- for more complex segmentation scenarios. The following inputs are all valid:
7667
+ for more complex segmentation scenarios. The following inputs are both valid:
6662
7668
 
6663
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
6664
- in the `"region"` column and specific dates in the `"date"` column
6665
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
6666
- columns
7669
+ ```
7670
+ # Segments from all unique values in the `region` column
7671
+ # and specific dates in the `date` column
7672
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
7673
+
7674
+ # Segments from all unique values in the `region` and `date` columns
7675
+ segments=["region", "date"]
7676
+ ```
6667
7677
 
6668
7678
  The segmentation is performed during interrogation, and the resulting validation steps will
6669
7679
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -8216,37 +9226,47 @@ class Validate:
8216
9226
 
8217
9227
  # Determine whether any preprocessing functions are to be applied to the table
8218
9228
  if validation.pre is not None:
8219
- # Read the text of the preprocessing function
8220
- pre_text = _pre_processing_funcs_to_str(validation.pre)
9229
+ try:
9230
+ # Read the text of the preprocessing function
9231
+ pre_text = _pre_processing_funcs_to_str(validation.pre)
8221
9232
 
8222
- # Determine if the preprocessing function is a lambda function; return a boolean
8223
- is_lambda = re.match(r"^lambda", pre_text) is not None
9233
+ # Determine if the preprocessing function is a lambda function; return a boolean
9234
+ is_lambda = re.match(r"^lambda", pre_text) is not None
8224
9235
 
8225
- # If the preprocessing function is a lambda function, then check if there is
8226
- # a keyword argument called `dfn` in the lamda signature; if so, that's a cue
8227
- # to use a Narwhalified version of the table
8228
- if is_lambda:
8229
- # Get the signature of the lambda function
8230
- sig = inspect.signature(validation.pre)
9236
+ # If the preprocessing function is a lambda function, then check if there is
9237
+ # a keyword argument called `dfn` in the lambda signature; if so, that's a cue
9238
+ # to use a Narwhalified version of the table
9239
+ if is_lambda:
9240
+ # Get the signature of the lambda function
9241
+ sig = inspect.signature(validation.pre)
8231
9242
 
8232
- # Check if the lambda function has a keyword argument called `dfn`
8233
- if "dfn" in sig.parameters:
8234
- # Convert the table to a Narwhals DataFrame
8235
- data_tbl_step = nw.from_native(data_tbl_step)
9243
+ # Check if the lambda function has a keyword argument called `dfn`
9244
+ if "dfn" in sig.parameters:
9245
+ # Convert the table to a Narwhals DataFrame
9246
+ data_tbl_step = nw.from_native(data_tbl_step)
8236
9247
 
8237
- # Apply the preprocessing function to the table
8238
- data_tbl_step = validation.pre(dfn=data_tbl_step)
9248
+ # Apply the preprocessing function to the table
9249
+ data_tbl_step = validation.pre(dfn=data_tbl_step)
8239
9250
 
8240
- # Convert the table back to its original format
8241
- data_tbl_step = nw.to_native(data_tbl_step)
9251
+ # Convert the table back to its original format
9252
+ data_tbl_step = nw.to_native(data_tbl_step)
8242
9253
 
8243
- else:
8244
- # Apply the preprocessing function to the table
9254
+ else:
9255
+ # Apply the preprocessing function to the table
9256
+ data_tbl_step = validation.pre(data_tbl_step)
9257
+
9258
+ # If the preprocessing function is a function, apply it to the table
9259
+ elif isinstance(validation.pre, Callable):
8245
9260
  data_tbl_step = validation.pre(data_tbl_step)
8246
9261
 
8247
- # If the preprocessing function is a function, apply it to the table
8248
- elif isinstance(validation.pre, Callable):
8249
- data_tbl_step = validation.pre(data_tbl_step)
9262
+ except Exception:
9263
+ # If preprocessing fails, mark the validation as having an eval_error
9264
+ validation.eval_error = True
9265
+ end_time = datetime.datetime.now(datetime.timezone.utc)
9266
+ validation.proc_duration_s = (end_time - start_time).total_seconds()
9267
+ validation.time_processed = end_time.isoformat(timespec="milliseconds")
9268
+ validation.active = False
9269
+ continue
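The new `try`/`except` wrapper above preserves the existing `pre=` dispatch: a lambda whose signature includes a `dfn` argument is handed a Narwhals-wrapped table, any other callable gets the native table, and a preprocessing failure now records `eval_error` on the step (and deactivates it) rather than aborting the whole interrogation. A hedged sketch of the two calling conventions; the table and column names below are invented:

```python
import narwhals as nw
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30]})

validation = (
    pb.Validate(data=tbl)
    # Plain callable: receives the table in its native form (a Polars DataFrame here)
    .col_vals_gt(columns="b", value=25, pre=lambda df: df.with_columns(pl.col("b") * 2))
    # Lambda with a `dfn` keyword: receives a Narwhals-wrapped version of the table
    .col_vals_gt(columns="a", value=1, pre=lambda dfn: dfn.filter(nw.col("a") > 1))
    .interrogate()
)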
8250
9270
 
8251
9271
  # ------------------------------------------------
8252
9272
  # Segmentation stage
@@ -8259,12 +9279,28 @@ class Validate:
8259
9279
  data_tbl=data_tbl_step, segments_expr=validation.segments
8260
9280
  )
8261
9281
 
9282
+ # ------------------------------------------------
9283
+ # Determine table type and `collect()` if needed
9284
+ # ------------------------------------------------
9285
+
9286
+ if tbl_type not in IBIS_BACKENDS:
9287
+ tbl_type = "local"
9288
+
9289
+ # If the table is a lazy frame, we need to collect it
9290
+ if _is_lazy_frame(data_tbl_step):
9291
+ data_tbl_step = data_tbl_step.collect()
9292
+
9293
+ # ------------------------------------------------
9294
+ # Set the number of test units
9295
+ # ------------------------------------------------
9296
+
8262
9297
  validation.n = NumberOfTestUnits(df=data_tbl_step, column=column).get_test_units(
8263
9298
  tbl_type=tbl_type
8264
9299
  )
8265
9300
 
8266
- if tbl_type not in IBIS_BACKENDS:
8267
- tbl_type = "local"
9301
+ # ------------------------------------------------
9302
+ # Validation stage
9303
+ # ------------------------------------------------
8268
9304
 
8269
9305
  if assertion_category == "COMPARE_ONE":
8270
9306
  results_tbl = ColValsCompareOne(
@@ -8455,36 +9491,32 @@ class Validate:
8455
9491
 
8456
9492
  else:
8457
9493
  # If the result is not a list, then we assume it's a table in the conventional
8458
- # form (where the column is `pb_is_good_` exists, with boolean values)
8459
-
9494
+ # form (where the column `pb_is_good_` exists, with boolean values)
8460
9495
  results_tbl = results_tbl_list
8461
9496
 
8462
9497
  # If the results table is not `None`, then we assume there is a table with a column
8463
9498
  # called `pb_is_good_` that contains boolean values; we can then use this table to
8464
9499
  # determine the number of test units that passed and failed
8465
9500
  if results_tbl is not None:
8466
- # Extract the `pb_is_good_` column from the table as a results list
8467
- if tbl_type in IBIS_BACKENDS:
8468
- # Select the DataFrame library to use for getting the results list
8469
- df_lib = _select_df_lib(preference="polars")
8470
- df_lib_name = df_lib.__name__
8471
-
8472
- if df_lib_name == "pandas":
8473
- results_list = (
8474
- results_tbl.select("pb_is_good_").to_pandas()["pb_is_good_"].to_list()
8475
- )
8476
- else:
8477
- results_list = (
8478
- results_tbl.select("pb_is_good_").to_polars()["pb_is_good_"].to_list()
8479
- )
9501
+ # Count the number of passing and failing test units
9502
+ validation.n_passed = _count_true_values_in_column(
9503
+ tbl=results_tbl, column="pb_is_good_"
9504
+ )
9505
+ validation.n_failed = _count_true_values_in_column(
9506
+ tbl=results_tbl, column="pb_is_good_", inverse=True
9507
+ )
8480
9508
 
8481
- else:
8482
- results_list = nw.from_native(results_tbl)["pb_is_good_"].to_list()
9509
+ # Solely for the col_vals_in_set assertion type, any Null values in the
9510
+ # `pb_is_good_` column are counted as failing test units
9511
+ if assertion_type == "col_vals_in_set":
9512
+ null_count = _count_null_values_in_column(tbl=results_tbl, column="pb_is_good_")
9513
+ validation.n_failed += null_count
9514
+
9515
+ # For column-value validations, the number of test units is the number of rows
9516
+ validation.n = get_row_count(data=results_tbl)
8483
9517
 
8484
- validation.all_passed = all(results_list)
8485
- validation.n = len(results_list)
8486
- validation.n_passed = results_list.count(True)
8487
- validation.n_failed = results_list.count(False)
9518
+ # Set the `all_passed` attribute based on whether there are any failing test units
9519
+ validation.all_passed = validation.n_failed == 0
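This rewritten block counts passing and failing test units directly in the results table (via the `_count_true_values_in_column()` and `_count_null_values_in_column()` helpers) instead of materializing `pb_is_good_` as a Python list. A rough sketch of the tallying logic in plain Polars, purely to illustrate what is being counted:

```python
import polars as pl

# A results table as produced by a column-value validation step: `pb_is_good_`
# is True for passing test units, False for failing ones, and may contain
# nulls (e.g., when `col_vals_in_set()` encounters missing values)
results_tbl = pl.DataFrame({"pb_is_good_": [True, True, False, None]})

is_good = results_tbl.get_column("pb_is_good_")

n_passed = is_good.sum()           # number of True values  -> 2
n_failed = (~is_good).sum()        # number of False values -> 1
null_count = is_good.null_count()  # number of nulls        -> 1

# For `col_vals_in_set()`, nulls are folded into the failing count
n_failed_in_set = n_failed + null_count
all_passed = n_failed == 0
```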
8488
9520
 
8489
9521
  # Calculate fractions of passing and failing test units
8490
9522
  # - `f_passed` is the fraction of test units that passed
@@ -8831,7 +9863,7 @@ class Validate:
8831
9863
  raise AssertionError(msg)
8832
9864
 
8833
9865
  def assert_below_threshold(
8834
- self, level: str = "warning", i: int = None, message: str = None
9866
+ self, level: str = "warning", i: int | None = None, message: str | None = None
8835
9867
  ) -> None:
8836
9868
  """
8837
9869
  Raise an `AssertionError` if validation steps exceed a specified threshold level.
@@ -8940,12 +9972,12 @@ class Validate:
8940
9972
 
8941
9973
  See Also
8942
9974
  --------
8943
- - [`warning()`](`pointblank.Validate.warning`): Get the 'warning' status for each validation
9975
+ - [`warning()`](`pointblank.Validate.warning`): get the 'warning' status for each validation
8944
9976
  step
8945
- - [`error()`](`pointblank.Validate.error`): Get the 'error' status for each validation step
8946
- - [`critical()`](`pointblank.Validate.critical`): Get the 'critical' status for each
9977
+ - [`error()`](`pointblank.Validate.error`): get the 'error' status for each validation step
9978
+ - [`critical()`](`pointblank.Validate.critical`): get the 'critical' status for each
8947
9979
  validation step
8948
- - [`assert_passing()`](`pointblank.Validate.assert_passing`): Assert all validations pass
9980
+ - [`assert_passing()`](`pointblank.Validate.assert_passing`): assert all validations pass
8949
9981
  completely
8950
9982
  """
8951
9983
  # Check if validation has been interrogated
@@ -8991,6 +10023,145 @@ class Validate:
8991
10023
  )
8992
10024
  raise AssertionError(msg)
8993
10025
 
10026
+ def above_threshold(self, level: str = "warning", i: int | None = None) -> bool:
10027
+ """
10028
+ Check if any validation steps exceed a specified threshold level.
10029
+
10030
+ The `above_threshold()` method checks whether validation steps exceed a given threshold
10031
+ level. This provides a non-exception-based alternative to
10032
+ [`assert_below_threshold()`](`pointblank.Validate.assert_below_threshold`) for conditional
10033
+ workflow control based on validation results.
10034
+
10035
+ This method is useful in scenarios where you want to check if any validation steps failed
10036
+ beyond a certain threshold without raising an exception, allowing for more flexible
10037
+ programmatic responses to validation issues.
10038
+
10039
+ Parameters
10040
+ ----------
10041
+ level
10042
+ The threshold level to check against. Valid options are: `"warning"` (the least severe
10043
+ threshold level), `"error"` (the middle severity threshold level), and `"critical"` (the
10044
+ most severe threshold level). The default is `"warning"`.
10045
+ i
10046
+ Specific validation step number(s) to check. If a single integer, checks only that step.
10047
+ If a list of integers, checks all specified steps. If `None` (the default), checks all
10048
+ validation steps. Step numbers are 1-based (first step is `1`, not `0`).
10049
+
10050
+ Returns
10051
+ -------
10052
+ bool
10053
+ `True` if any of the specified validation steps exceed the given threshold level,
10054
+ `False` otherwise.
10055
+
10056
+ Raises
10057
+ ------
10058
+ ValueError
10059
+ If an invalid threshold level is provided.
10060
+
10061
+ Examples
10062
+ --------
10063
+ ```{python}
10064
+ #| echo: false
10065
+ #| output: false
10066
+ import pointblank as pb
10067
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
10068
+ ```
10069
+ Below are some examples of how to use the `above_threshold()` method. First, we'll create a
10070
+ simple Polars DataFrame with a single column (`values`).
10071
+
10072
+ ```{python}
10073
+ import polars as pl
10074
+
10075
+ tbl = pl.DataFrame({
10076
+ "values": [1, 2, 3, 4, 5, 0, -1]
10077
+ })
10078
+ ```
10079
+
10080
+ Then a validation plan will be created with thresholds (`warning=0.1`, `error=0.2`,
10081
+ `critical=0.3`). After interrogating, we display the validation report table:
10082
+
10083
+ ```{python}
10084
+ import pointblank as pb
10085
+
10086
+ validation = (
10087
+ pb.Validate(data=tbl, thresholds=(0.1, 0.2, 0.3))
10088
+ .col_vals_gt(columns="values", value=0)
10089
+ .col_vals_lt(columns="values", value=10)
10090
+ .col_vals_between(columns="values", left=0, right=5)
10091
+ .interrogate()
10092
+ )
10093
+
10094
+ validation
10095
+ ```
10096
+
10097
+ Let's check if any steps exceed the 'warning' threshold with the `above_threshold()` method.
10098
+ A message will be printed if that's the case:
10099
+
10100
+ ```{python}
10101
+ if validation.above_threshold(level="warning"):
10102
+ print("Some steps have exceeded the warning threshold")
10103
+ ```
10104
+
10105
+ Check if only steps 2 and 3 exceed the 'error' threshold through use of the `i=` argument:
10106
+
10107
+ ```{python}
10108
+ if validation.above_threshold(level="error", i=[2, 3]):
10109
+ print("Steps 2 and/or 3 have exceeded the error threshold")
10110
+ ```
10111
+
10112
+ You can use this in a workflow to conditionally trigger processes. Here's a snippet of how
10113
+ you might use this in a function:
10114
+
10115
+ ```python
10116
+ def process_data(validation_obj):
10117
+ # Only continue processing if validation passes critical thresholds
10118
+ if not validation_obj.above_threshold(level="critical"):
10119
+ # Continue with processing
10120
+ print("Data meets critical quality thresholds, proceeding...")
10121
+ return True
10122
+ else:
10123
+ # Log failure and stop processing
10124
+ print("Data fails critical quality checks, aborting...")
10125
+ return False
10126
+ ```
10127
+
10128
+ Note that this is just a suggestion for how to implement conditional workflow processes. You
10129
+ should adapt this pattern to your specific requirements, which might include different
10130
+ threshold levels, custom logging mechanisms, or integration with your organization's data
10131
+ pipelines and notification systems.
10132
+
10133
+ See Also
10134
+ --------
10135
+ - [`assert_below_threshold()`](`pointblank.Validate.assert_below_threshold`): a similar
10136
+ method that raises an exception if thresholds are exceeded
10137
+ - [`warning()`](`pointblank.Validate.warning`): get the 'warning' status for each validation
10138
+ step
10139
+ - [`error()`](`pointblank.Validate.error`): get the 'error' status for each validation step
10140
+ - [`critical()`](`pointblank.Validate.critical`): get the 'critical' status for each
10141
+ validation step
10142
+ """
10143
+ # Ensure validation has been run
10144
+ if not hasattr(self, "time_start") or self.time_start is None:
10145
+ return False
10146
+
10147
+ # Validate the level parameter
10148
+ level = level.lower()
10149
+ if level not in ["warning", "error", "critical"]:
10150
+ raise ValueError(
10151
+ f"Invalid threshold level: {level}. Must be one of 'warning', 'error', or 'critical'."
10152
+ )
10153
+
10154
+ # Get the threshold status using the appropriate method
10155
+ if level == "warning":
10156
+ status = self.warning(i=i)
10157
+ elif level == "error":
10158
+ status = self.error(i=i)
10159
+ elif level == "critical":
10160
+ status = self.critical(i=i)
10161
+
10162
+ # Return True if any steps exceeded the threshold
10163
+ return any(status.values())
10164
+
8994
10165
  def n(self, i: int | list[int] | None = None, scalar: bool = False) -> dict[int, int] | int:
8995
10166
  """
8996
10167
  Provides a dictionary of the number of test units for each validation step.
@@ -9654,7 +10825,7 @@ class Validate:
9654
10825
  Get the 'critical' level status for each validation step.
9655
10826
 
9656
10827
  The 'critical' status for a validation step is `True` if the fraction of failing test units
9657
- meets or exceeds the threshold for the notification level. Otherwise, the status is `False`.
10828
+ meets or exceeds the threshold for the 'critical' level. Otherwise, the status is `False`.
9658
10829
 
9659
10830
  The ascribed name of 'critical' is semantic and is thus simply a status indicator that could
9660
10831
  be used to trigger some action to be taken. Here's how it fits in with other status
@@ -9666,14 +10837,14 @@ class Validate:
9666
10837
  severity
9667
10838
  - 'critical': the status obtained by calling `critical()`, most severe
9668
10839
 
9669
- This method provides a dictionary of the notification status for each validation step. If
9670
- the `scalar=True` argument is provided and `i=` is a scalar, the value is returned as a
9671
- scalar instead of a dictionary.
10840
+ This method provides a dictionary of the 'critical' status for each validation step. If the
10841
+ `scalar=True` argument is provided and `i=` is a scalar, the value is returned as a scalar
10842
+ instead of a dictionary.
9672
10843
 
9673
10844
  Parameters
9674
10845
  ----------
9675
10846
  i
9676
- The validation step number(s) from which the notification status is obtained. Can be
10847
+ The validation step number(s) from which the 'critical' status is obtained. Can be
9677
10848
  provided as a list of integers or a single integer. If `None`, all steps are included.
9678
10849
  scalar
9679
10850
  If `True` and `i=` is a scalar, return the value as a scalar instead of a dictionary.
@@ -9681,7 +10852,7 @@ class Validate:
9681
10852
  Returns
9682
10853
  -------
9683
10854
  dict[int, bool] | bool
9684
- A dictionary of the notification status for each validation step or a scalar value.
10855
+ A dictionary of the 'critical' status for each validation step or a scalar value.
9685
10856
 
9686
10857
  Examples
9687
10858
  --------
@@ -9760,11 +10931,13 @@ class Validate:
9760
10931
  Get the rows that failed for each validation step.
9761
10932
 
9762
10933
  After the [`interrogate()`](`pointblank.Validate.interrogate`) method has been called, the
9763
- `get_data_extracts()` method can be used to extract the rows that failed in each row-based
9764
- validation step (e.g., [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`), etc.). The
9765
- method returns a dictionary of tables containing the rows that failed in every row-based
9766
- validation function. If `frame=True` and `i=` is a scalar, the value is conveniently
9767
- returned as a table (forgoing the dictionary structure).
10934
+ `get_data_extracts()` method can be used to extract the rows that failed in each
10935
+ column-value or row-based validation step (e.g.,
10936
+ [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`),
10937
+ [`rows_distinct()`](`pointblank.Validate.rows_distinct`), etc.). The method returns a
10938
+ dictionary of tables containing the rows that failed in every validation step. If
10939
+ `frame=True` and `i=` is a scalar, the value is conveniently returned as a table (forgoing
10940
+ the dictionary structure).
9768
10941
 
9769
10942
  Parameters
9770
10943
  ----------
@@ -9777,13 +10950,13 @@ class Validate:
9777
10950
  Returns
9778
10951
  -------
9779
10952
  dict[int, FrameT | None] | FrameT | None
9780
- A dictionary of tables containing the rows that failed in every row-based validation
9781
- step or a DataFrame.
10953
+ A dictionary of tables containing the rows that failed in every compatible validation
10954
+ step. Alternatively, it can be a DataFrame if `frame=True` and `i=` is a scalar.
9782
10955
 
9783
- Validation Methods that are Row-Based
9784
- -------------------------------------
9785
- The following validation methods are row-based and will have rows extracted when there are
9786
- failing test units.
10956
+ Compatible Validation Methods for Yielding Extracted Rows
10957
+ ---------------------------------------------------------
10958
+ The following validation methods operate on column values and will have rows extracted when
10959
+ there are failing test units.
9787
10960
 
9788
10961
  - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
9789
10962
  - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
@@ -9798,11 +10971,20 @@ class Validate:
9798
10971
  - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
9799
10972
  - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
9800
10973
  - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
10974
+ - [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
10975
+ - [`conjointly()`](`pointblank.Validate.conjointly`)
10976
+
10977
+ An extracted row for these validation methods means that a test unit failed for that row in
10978
+ the validation step.
10979
+
10980
+ These row-based validation methods will also have rows extracted should there be failing
10981
+ rows:
10982
+
9801
10983
  - [`rows_distinct()`](`pointblank.Validate.rows_distinct`)
10984
+ - [`rows_complete()`](`pointblank.Validate.rows_complete`)
9802
10985
 
9803
- An extracted row means that a test unit failed for that row in the validation step. The
9804
- extracted rows are a subset of the original table and are useful for further analysis or for
9805
- understanding the nature of the failing test units.
10986
+ The extracted rows are a subset of the original table and are useful for further analysis
10987
+ or for understanding the nature of the failing test units.
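A brief, hedged sketch of pulling extracts (the table below is invented; the `i=` and `frame=` arguments are as described in this docstring):

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"a": [5, 7, -1, 9, 0]})

validation = (
    pb.Validate(data=tbl)
    .col_vals_gt(columns="a", value=0)   # step 1: the rows with -1 and 0 fail
    .interrogate()
)

# Dictionary keyed by step number; each value is a table of that step's failing rows
extracts = validation.get_data_extracts()

# With `frame=True` and a scalar `i=`, a single extract is returned as a table
failed_rows_step_1 = validation.get_data_extracts(i=1, frame=True)
```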
9806
10988
 
9807
10989
  Examples
9808
10990
  --------
@@ -10058,10 +11240,10 @@ class Validate:
10058
11240
  Get the data that passed or failed the validation steps.
10059
11241
 
10060
11242
  Validation of the data is one thing but, sometimes, you want to use the best part of the
10061
- input dataset for something else. The `get_sundered_data()` method works with a Validate
11243
+ input dataset for something else. The `get_sundered_data()` method works with a `Validate`
10062
11244
  object that has been interrogated (i.e., the
10063
11245
  [`interrogate()`](`pointblank.Validate.interrogate`) method was used). We can get either the
10064
- 'pass' data piece (rows with no failing test units across all row-based validation
11246
+ 'pass' data piece (rows with no failing test units across all column-value based validation
10065
11247
  functions), or, the 'fail' data piece (rows with at least one failing test unit across the
10066
11248
  same series of validations).
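As a compact illustration of sundering (the input table is invented, and the argument that selects the 'fail' piece is assumed rather than shown in this excerpt):

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"a": [5, 7, -1, 9, 0]})

validation = (
    pb.Validate(data=tbl)
    .col_vals_gt(columns="a", value=0)
    .interrogate()
)

# Rows with no failing test units across the column-value validation steps
passing_rows = validation.get_sundered_data()

# The complementary 'fail' piece is requested through the method's piece-selection
# argument (e.g., `type="fail"`; the argument name is an assumption here)
```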
10067
11249
 
@@ -10070,7 +11252,7 @@ class Validate:
10070
11252
  There are some caveats to sundering. The validation steps considered for this splitting will
10071
11253
  only involve steps where:
10072
11254
 
10073
- - of certain check types, where test units are cells checked row-by-row (e.g., the
11255
+ - of certain check types, where test units are cells checked down a column (e.g., the
10074
11256
  `col_vals_*()` methods)
10075
11257
  - `active=` is not set to `False`
10076
11258
  - `pre=` has not been given an expression for modifying the input table
@@ -10301,6 +11483,19 @@ class Validate:
10301
11483
  # Get information on the input data table
10302
11484
  tbl_info = _get_tbl_type(data=self.data)
10303
11485
 
11486
+ # If the table is a Polars one, determine if it's a LazyFrame
11487
+ if tbl_info == "polars":
11488
+ if _is_lazy_frame(self.data):
11489
+ tbl_info = "polars-lazy"
11490
+
11491
+ # Determine if the input table is a Narwhals DF
11492
+ if _is_narwhals_table(self.data):
11493
+ # Determine if the Narwhals table is a LazyFrame
11494
+ if _is_lazy_frame(self.data):
11495
+ tbl_info = "narwhals-lazy"
11496
+ else:
11497
+ tbl_info = "narwhals"
11498
+
10304
11499
  # Get the thresholds object
10305
11500
  thresholds = self.thresholds
10306
11501
 
@@ -10353,7 +11548,9 @@ class Validate:
10353
11548
  # Create the label, table type, and thresholds HTML fragments
10354
11549
  label_html = _create_label_html(label=self.label, start_time="")
10355
11550
  table_type_html = _create_table_type_html(tbl_type=tbl_info, tbl_name=self.tbl_name)
10356
- thresholds_html = _create_thresholds_html(thresholds=thresholds, locale=locale)
11551
+ thresholds_html = _create_thresholds_html(
11552
+ thresholds=thresholds, locale=locale, df_lib=df_lib
11553
+ )
10357
11554
 
10358
11555
  # Compose the subtitle HTML fragment
10359
11556
  combined_subtitle = (
@@ -10666,6 +11863,7 @@ class Validate:
10666
11863
  interrogation_performed=interrogation_performed,
10667
11864
  active=active,
10668
11865
  locale=locale,
11866
+ df_lib=df_lib,
10669
11867
  )
10670
11868
 
10671
11869
  # ------------------------------------------------
@@ -10682,6 +11880,7 @@ class Validate:
10682
11880
  interrogation_performed=interrogation_performed,
10683
11881
  active=active,
10684
11882
  locale=locale,
11883
+ df_lib=df_lib,
10685
11884
  )
10686
11885
 
10687
11886
  validation_info_dict["fail"] = _transform_passed_failed(
@@ -10690,6 +11889,7 @@ class Validate:
10690
11889
  interrogation_performed=interrogation_performed,
10691
11890
  active=active,
10692
11891
  locale=locale,
11892
+ df_lib=df_lib,
10693
11893
  )
10694
11894
 
10695
11895
  # ------------------------------------------------
@@ -10869,7 +12069,9 @@ class Validate:
10869
12069
  # Create the label, table type, and thresholds HTML fragments
10870
12070
  label_html = _create_label_html(label=self.label, start_time=self.time_start)
10871
12071
  table_type_html = _create_table_type_html(tbl_type=tbl_info, tbl_name=self.tbl_name)
10872
- thresholds_html = _create_thresholds_html(thresholds=thresholds, locale=locale)
12072
+ thresholds_html = _create_thresholds_html(
12073
+ thresholds=thresholds, locale=locale, df_lib=df_lib
12074
+ )
10873
12075
 
10874
12076
  # Compose the subtitle HTML fragment
10875
12077
  combined_subtitle = (
@@ -11127,24 +12329,25 @@ class Validate:
11127
12329
  Types of Step Reports
11128
12330
  ---------------------
11129
12331
  The `get_step_report()` method produces a report based on the *type* of validation step.
11130
- The following row-based validation methods will produce a report that shows the rows of the
11131
- data that failed because of failing test units within one or more columns failed:
12332
+ The following column-value or row-based validation methods will produce a
12333
+ report that shows the rows of the data that failed:
11132
12334
 
11133
12335
  - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
12336
+ - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
11134
12337
  - [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`)
12338
+ - [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
11135
12339
  - [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`)
11136
12340
  - [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`)
11137
- - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
11138
- - [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
11139
12341
  - [`col_vals_between()`](`pointblank.Validate.col_vals_between`)
11140
12342
  - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
11141
12343
  - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
11142
12344
  - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
11143
- - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
11144
12345
  - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
11145
12346
  - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
11146
- - [`rows_complete()`](`pointblank.Validate.rows_complete`)
12347
+ - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
12348
+ - [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
11147
12349
  - [`conjointly()`](`pointblank.Validate.conjointly`)
12350
+ - [`rows_complete()`](`pointblank.Validate.rows_complete`)
11148
12351
 
11149
12352
  The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
11150
12353
  report that shows duplicate rows (or duplicate values in one or a set of columns as defined
@@ -12671,20 +13874,78 @@ def _transform_eval(
12671
13874
  return symbol_list
12672
13875
 
12673
13876
 
13877
+ def _format_numbers_with_gt(
13878
+ values: list[int], n_sigfig: int = 3, compact: bool = True, locale: str = "en"
13879
+ ) -> list[str]:
13880
+ """Format numbers using Great Tables GT object to avoid pandas dependency."""
13881
+ import polars as pl
13882
+
13883
+ # Create a single-column DataFrame with all values
13884
+ df = pl.DataFrame({"values": values})
13885
+
13886
+ # Create GT object and format the column
13887
+ gt_obj = GT(df).fmt_number(columns="values", n_sigfig=n_sigfig, compact=compact, locale=locale)
13888
+
13889
+ # Extract the formatted values using _get_column_of_values
13890
+ formatted_values = _get_column_of_values(gt_obj, column_name="values", context="html")
13891
+
13892
+ return formatted_values
13893
+
13894
+
13895
+ def _format_single_number_with_gt(
13896
+ value: int, n_sigfig: int = 3, compact: bool = True, locale: str = "en", df_lib=None
13897
+ ) -> str:
13898
+ """Format a single number using Great Tables GT object to avoid pandas dependency."""
13899
+ if df_lib is None:
13900
+ # Use library detection to select appropriate DataFrame library
13901
+ if _is_lib_present("polars"):
13902
+ import polars as pl
13903
+
13904
+ df_lib = pl
13905
+ elif _is_lib_present("pandas"):
13906
+ import pandas as pd
13907
+
13908
+ df_lib = pd
13909
+ else:
13910
+ raise ImportError("Neither Polars nor Pandas is available for formatting")
13911
+
13912
+ # Create a single-row, single-column DataFrame using the specified library
13913
+ df = df_lib.DataFrame({"value": [value]})
13914
+
13915
+ # Create GT object and format the column
13916
+ gt_obj = GT(df).fmt_number(columns="value", n_sigfig=n_sigfig, compact=compact, locale=locale)
13917
+
13918
+ # Extract the formatted value using _get_column_of_values
13919
+ formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
13920
+
13921
+ return formatted_values[0] # Return the single formatted value
13922
+
13923
+
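These helpers route number formatting through a Great Tables `GT` object so the report can be rendered without the Pandas dependency that `vals.fmt_number()` pulls in when only Polars is installed. A condensed sketch of the same round trip, using the public `GT.fmt_number()` API and the private `_get_column_of_values()` accessor this module already imports (the example values are arbitrary):

```python
import polars as pl
from great_tables import GT
from great_tables.gt import _get_column_of_values

# Build a one-column table, format it with GT, then read back the rendered strings
df = pl.DataFrame({"value": [12345678, 4200]})
gt_obj = GT(df).fmt_number(columns="value", n_sigfig=3, compact=True, locale="en")
formatted = _get_column_of_values(gt_obj, column_name="value", context="html")
print(formatted)  # e.g., ["12.3M", "4.20K"]
```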
12674
13924
  def _transform_test_units(
12675
- test_units: list[int], interrogation_performed: bool, active: list[bool], locale: str
13925
+ test_units: list[int],
13926
+ interrogation_performed: bool,
13927
+ active: list[bool],
13928
+ locale: str,
13929
+ df_lib=None,
12676
13930
  ) -> list[str]:
12677
13931
  # If no interrogation was performed, return a list of empty strings
12678
13932
  if not interrogation_performed:
12679
13933
  return ["" for _ in range(len(test_units))]
12680
13934
 
13935
+ # Define the helper function that'll format numbers safely with Great Tables
13936
+ def _format_number_safe(value: int) -> str:
13937
+ if df_lib is not None:
13938
+ # Use GT-based formatting to avoid Pandas dependency completely
13939
+ return _format_single_number_with_gt(
13940
+ value, n_sigfig=3, compact=True, locale=locale, df_lib=df_lib
13941
+ )
13942
+ else:
13943
+ # Fallback to the original behavior
13944
+ return str(vals.fmt_number(value, n_sigfig=3, compact=True, locale=locale)[0])
13945
+
12681
13946
  return [
12682
13947
  (
12683
- (
12684
- str(test_units[i])
12685
- if test_units[i] < 10000
12686
- else str(vals.fmt_number(test_units[i], n_sigfig=3, compact=True, locale=locale)[0])
12687
- )
13948
+ (str(test_units[i]) if test_units[i] < 10000 else _format_number_safe(test_units[i]))
12688
13949
  if active[i]
12689
13950
  else "&mdash;"
12690
13951
  )
@@ -12692,8 +13953,43 @@ def _transform_test_units(
12692
13953
  ]
12693
13954
 
12694
13955
 
12695
- def _fmt_lg(value: int, locale: str) -> str:
12696
- return vals.fmt_number(value, n_sigfig=3, compact=True, locale=locale)[0]
13956
+ def _fmt_lg(value: int, locale: str, df_lib=None) -> str:
13957
+ if df_lib is not None:
13958
+ # Use GT-based formatting if a DataFrame library is provided
13959
+ return _format_single_number_with_gt(
13960
+ value, n_sigfig=3, compact=True, locale=locale, df_lib=df_lib
13961
+ )
13962
+ else:
13963
+ # Fallback to the original behavior
13964
+ return vals.fmt_number(value, n_sigfig=3, compact=True, locale=locale)[0]
13965
+
13966
+
13967
+ def _format_single_float_with_gt(
13968
+ value: float, decimals: int = 2, locale: str = "en", df_lib=None
13969
+ ) -> str:
13970
+ if df_lib is None:
13971
+ # Use library detection to select appropriate DataFrame library
13972
+ if _is_lib_present("polars"):
13973
+ import polars as pl
13974
+
13975
+ df_lib = pl
13976
+ elif _is_lib_present("pandas"):
13977
+ import pandas as pd
13978
+
13979
+ df_lib = pd
13980
+ else:
13981
+ raise ImportError("Neither Polars nor Pandas is available for formatting")
13982
+
13983
+ # Create a single-row, single-column DataFrame using the specified library
13984
+ df = df_lib.DataFrame({"value": [value]})
13985
+
13986
+ # Create GT object and format the column
13987
+ gt_obj = GT(df).fmt_number(columns="value", decimals=decimals, locale=locale)
13988
+
13989
+ # Extract the formatted value using _get_column_of_values
13990
+ formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
13991
+
13992
+ return formatted_values[0] # Return the single formatted value
12697
13993
 
12698
13994
 
12699
13995
  def _transform_passed_failed(
@@ -12702,14 +13998,24 @@ def _transform_passed_failed(
12702
13998
  interrogation_performed: bool,
12703
13999
  active: list[bool],
12704
14000
  locale: str,
14001
+ df_lib=None,
12705
14002
  ) -> list[str]:
12706
14003
  if not interrogation_performed:
12707
14004
  return ["" for _ in range(len(n_passed_failed))]
12708
14005
 
14006
+ # Helper function to format numbers safely
14007
+ def _format_float_safe(value: float) -> str:
14008
+ if df_lib is not None:
14009
+ # Use GT-based formatting to avoid Pandas dependency completely
14010
+ return _format_single_float_with_gt(value, decimals=2, locale=locale, df_lib=df_lib)
14011
+ else:
14012
+ # Fallback to the original behavior
14013
+ return vals.fmt_number(value, decimals=2, locale=locale)[0]
14014
+
12709
14015
  passed_failed = [
12710
14016
  (
12711
- f"{n_passed_failed[i] if n_passed_failed[i] < 10000 else _fmt_lg(n_passed_failed[i], locale=locale)}"
12712
- f"<br />{vals.fmt_number(f_passed_failed[i], decimals=2, locale=locale)[0]}"
14017
+ f"{n_passed_failed[i] if n_passed_failed[i] < 10000 else _fmt_lg(n_passed_failed[i], locale=locale, df_lib=df_lib)}"
14018
+ f"<br />{_format_float_safe(f_passed_failed[i])}"
12713
14019
  if active[i]
12714
14020
  else "&mdash;"
12715
14021
  )
@@ -12920,41 +14226,122 @@ def _create_label_html(label: str | None, start_time: str) -> str:
12920
14226
  )
12921
14227
 
12922
14228
 
12923
- def _create_thresholds_html(thresholds: Thresholds, locale: str) -> str:
14229
+ def _format_single_integer_with_gt(value: int, locale: str = "en", df_lib=None) -> str:
14230
+ """Format a single integer using Great Tables GT object to avoid pandas dependency."""
14231
+ if df_lib is None:
14232
+ # Use library detection to select appropriate DataFrame library
14233
+ if _is_lib_present("polars"):
14234
+ import polars as pl
14235
+
14236
+ df_lib = pl
14237
+ elif _is_lib_present("pandas"):
14238
+ import pandas as pd
14239
+
14240
+ df_lib = pd
14241
+ else:
14242
+ raise ImportError("Neither Polars nor Pandas is available for formatting")
14243
+
14244
+ # Create a single-row, single-column DataFrame using the specified library
14245
+ df = df_lib.DataFrame({"value": [value]})
14246
+
14247
+ # Create GT object and format the column
14248
+ gt_obj = GT(df).fmt_integer(columns="value", locale=locale)
14249
+
14250
+ # Extract the formatted value using _get_column_of_values
14251
+ formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
14252
+
14253
+ return formatted_values[0] # Return the single formatted value
14254
+
14255
+
14256
+ def _format_single_float_with_gt_custom(
14257
+ value: float,
14258
+ decimals: int = 2,
14259
+ drop_trailing_zeros: bool = False,
14260
+ locale: str = "en",
14261
+ df_lib=None,
14262
+ ) -> str:
14263
+ """Format a single float with custom options using Great Tables GT object to avoid pandas dependency."""
14264
+ if df_lib is None:
14265
+ # Use library detection to select appropriate DataFrame library
14266
+ if _is_lib_present("polars"):
14267
+ import polars as pl
14268
+
14269
+ df_lib = pl
14270
+ elif _is_lib_present("pandas"):
14271
+ import pandas as pd
14272
+
14273
+ df_lib = pd
14274
+ else:
14275
+ raise ImportError("Neither Polars nor Pandas is available for formatting")
14276
+
14277
+ # Create a single-row, single-column DataFrame using the specified library
14278
+ df = df_lib.DataFrame({"value": [value]})
14279
+
14280
+ # Create GT object and format the column
14281
+ gt_obj = GT(df).fmt_number(
14282
+ columns="value", decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
14283
+ )
14284
+
14285
+ # Extract the formatted value using _get_column_of_values
14286
+ formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
14287
+
14288
+ return formatted_values[0] # Return the single formatted value
14289
+
14290
+
14291
+ def _create_thresholds_html(thresholds: Thresholds, locale: str, df_lib=None) -> str:
12924
14292
  if thresholds == Thresholds():
12925
14293
  return ""
12926
14294
 
14295
+ # Helper functions to format numbers safely
14296
+ def _format_number_safe(value: float, decimals: int, drop_trailing_zeros: bool = False) -> str:
14297
+ if df_lib is not None and value is not None:
14298
+ # Use GT-based formatting to avoid Pandas dependency completely
14299
+ return _format_single_float_with_gt_custom(
14300
+ value,
14301
+ decimals=decimals,
14302
+ drop_trailing_zeros=drop_trailing_zeros,
14303
+ locale=locale,
14304
+ df_lib=df_lib,
14305
+ )
14306
+ else:
14307
+ # Fallback to the original behavior
14308
+ return fmt_number(
14309
+ value, decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
14310
+ )[0]
14311
+
14312
+ def _format_integer_safe(value: int) -> str:
14313
+ if df_lib is not None and value is not None:
14314
+ # Use GT-based formatting to avoid Pandas dependency completely
14315
+ return _format_single_integer_with_gt(value, locale=locale, df_lib=df_lib)
14316
+ else:
14317
+ # Fallback to the original behavior
14318
+ return fmt_integer(value, locale=locale)[0]
14319
+
12927
14320
  warning = (
12928
- fmt_number(
12929
- thresholds.warning_fraction, decimals=3, drop_trailing_zeros=True, locale=locale
12930
- )[0]
14321
+ _format_number_safe(thresholds.warning_fraction, decimals=3, drop_trailing_zeros=True)
12931
14322
  if thresholds.warning_fraction is not None
12932
14323
  else (
12933
- fmt_integer(thresholds.warning_count, locale=locale)[0]
14324
+ _format_integer_safe(thresholds.warning_count)
12934
14325
  if thresholds.warning_count is not None
12935
14326
  else "&mdash;"
12936
14327
  )
12937
14328
  )
12938
14329
 
12939
14330
  error = (
12940
- fmt_number(thresholds.error_fraction, decimals=3, drop_trailing_zeros=True, locale=locale)[
12941
- 0
12942
- ]
14331
+ _format_number_safe(thresholds.error_fraction, decimals=3, drop_trailing_zeros=True)
12943
14332
  if thresholds.error_fraction is not None
12944
14333
  else (
12945
- fmt_integer(thresholds.error_count, locale=locale)[0]
14334
+ _format_integer_safe(thresholds.error_count)
12946
14335
  if thresholds.error_count is not None
12947
14336
  else "&mdash;"
12948
14337
  )
12949
14338
  )
12950
14339
 
12951
14340
  critical = (
12952
- fmt_number(
12953
- thresholds.critical_fraction, decimals=3, drop_trailing_zeros=True, locale=locale
12954
- )[0]
14341
+ _format_number_safe(thresholds.critical_fraction, decimals=3, drop_trailing_zeros=True)
12955
14342
  if thresholds.critical_fraction is not None
12956
14343
  else (
12957
- fmt_integer(thresholds.critical_count, locale=locale)[0]
14344
+ _format_integer_safe(thresholds.critical_count)
12958
14345
  if thresholds.critical_count is not None
12959
14346
  else "&mdash;"
12960
14347
  )