pointblank 0.9.6__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/validate.py CHANGED
@@ -17,6 +17,7 @@ from zipfile import ZipFile
17
17
  import commonmark
18
18
  import narwhals as nw
19
19
  from great_tables import GT, from_column, google_font, html, loc, md, style, vals
20
+ from great_tables.gt import _get_column_of_values
20
21
  from great_tables.vals import fmt_integer, fmt_number
21
22
  from importlib_resources import files
22
23
  from narwhals.typing import FrameT
@@ -64,11 +65,15 @@ from pointblank._typing import SegmentSpec
64
65
  from pointblank._utils import (
65
66
  _check_any_df_lib,
66
67
  _check_invalid_fields,
68
+ _count_null_values_in_column,
69
+ _count_true_values_in_column,
67
70
  _derive_bounds,
68
71
  _format_to_integer_value,
69
72
  _get_fn_name,
70
73
  _get_tbl_type,
74
+ _is_lazy_frame,
71
75
  _is_lib_present,
76
+ _is_narwhals_table,
72
77
  _is_value_a_df,
73
78
  _select_df_lib,
74
79
  )
@@ -99,11 +104,13 @@ __all__ = [
99
104
  "Validate",
100
105
  "load_dataset",
101
106
  "config",
107
+ "connect_to_table",
102
108
  "preview",
103
109
  "missing_vals_tbl",
110
+ "get_action_metadata",
104
111
  "get_column_count",
112
+ "get_data_path",
105
113
  "get_row_count",
106
- "get_action_metadata",
107
114
  "get_validation_summary",
108
115
  ]
109
116
 
@@ -495,7 +502,9 @@ def load_dataset(
495
502
  raise ValueError(
496
503
  f"The dataset name `{dataset}` is not valid. Choose one of the following:\n"
497
504
  "- `small_table`\n"
498
- "- `game_revenue`"
505
+ "- `game_revenue`\n"
506
+ "- `nycflights`\n"
507
+ "- `global_sales`"
499
508
  )
500
509
 
501
510
  # Raise an error if the `tbl_type=` value is not of the supported types
@@ -560,6 +569,405 @@ def load_dataset(
560
569
  return dataset
561
570
 
562
571
 
572
+ def get_data_path(
573
+ dataset: Literal["small_table", "game_revenue", "nycflights", "global_sales"] = "small_table",
574
+ file_type: Literal["csv", "parquet", "duckdb"] = "csv",
575
+ ) -> str:
576
+ """
577
+ Get the file path to a dataset included with the Pointblank package.
578
+
579
+ This function provides direct access to the file paths of datasets included with Pointblank.
580
+ These paths can be used in examples and documentation to demonstrate file-based data loading
581
+ without requiring users to supply their own data files. The returned paths can be used with
582
+ `Validate(data=path)` to demonstrate CSV and Parquet file loading capabilities.
583
+
584
+ Parameters
585
+ ----------
586
+ dataset
587
+ The name of the dataset to get the path for. Current options are `"small_table"`,
588
+ `"game_revenue"`, `"nycflights"`, and `"global_sales"`.
589
+ file_type
590
+ The file format to get the path for. Options are `"csv"`, `"parquet"`, or `"duckdb"`.
591
+
592
+ Returns
593
+ -------
594
+ str
595
+ The file path to the requested dataset file (a temporary copy extracted or converted from
+ the data bundled with the package).
596
+
597
+ Included Datasets
598
+ -----------------
599
+ The available datasets are the same as those in [`load_dataset()`](`pointblank.load_dataset`):
600
+
601
+ - `"small_table"`: A small dataset with 13 rows and 8 columns. Ideal for testing and examples.
602
+ - `"game_revenue"`: A dataset with 2000 rows and 11 columns. Revenue data for a game company.
603
+ - `"nycflights"`: A dataset with 336,776 rows and 18 columns. Flight data from NYC airports.
604
+ - `"global_sales"`: A dataset with 50,000 rows and 20 columns. Global sales data across regions.
605
+
606
+ File Types
607
+ ----------
608
+ Each dataset is available in multiple formats:
609
+
610
+ - `"csv"`: Comma-separated values file (`.csv`)
611
+ - `"parquet"`: Parquet file (`.parquet`)
612
+ - `"duckdb"`: DuckDB database file (`.ddb`)
613
+
614
+ Examples
615
+ --------
616
+ Get the path to a CSV file and use it with `Validate`:
617
+
618
+ ```{python}
619
+ import pointblank as pb
620
+
621
+ # Get path to the small_table CSV file
622
+ csv_path = pb.get_data_path("small_table", "csv")
623
+ print(csv_path)
624
+
625
+ # Use the path directly with Validate
626
+ validation = (
627
+ pb.Validate(data=csv_path)
628
+ .col_exists(["a", "b", "c"])
629
+ .col_vals_gt(columns="d", value=0)
630
+ .interrogate()
631
+ )
632
+
633
+ validation
634
+ ```
635
+
636
+ Get a Parquet file path for validation examples:
637
+
638
+ ```{python}
639
+ # Get path to the game_revenue Parquet file
640
+ parquet_path = pb.get_data_path(dataset="game_revenue", file_type="parquet")
641
+
642
+ # Validate the Parquet file directly
643
+ validation = (
644
+ pb.Validate(data=parquet_path, label="Game Revenue Data Validation")
645
+ .col_vals_not_null(columns=["player_id", "session_id"])
646
+ .col_vals_gt(columns="item_revenue", value=0)
647
+ .interrogate()
648
+ )
649
+
650
+ validation
651
+ ```
652
+
653
+ This is particularly useful for documentation examples where you want to demonstrate
654
+ file-based workflows without requiring users to have specific data files:
655
+
656
+ ```{python}
657
+ # Example showing CSV file validation
658
+ sales_csv = pb.get_data_path(dataset="global_sales", file_type="csv")
659
+
660
+ validation = (
661
+ pb.Validate(data=sales_csv, label="Sales Data Validation")
662
+ .col_exists(["customer_id", "product_id", "amount"])
663
+ .col_vals_regex(columns="customer_id", pattern=r"CUST_[0-9]{6}")
664
+ .interrogate()
665
+ )
666
+ ```
667
+
668
+ See Also
669
+ --------
670
+ [`load_dataset()`](`pointblank.load_dataset`) for loading datasets directly as table objects.
671
+ """
672
+
673
+ # Validate inputs
674
+ if dataset not in ["small_table", "game_revenue", "nycflights", "global_sales"]:
675
+ raise ValueError(
676
+ f"The dataset name `{dataset}` is not valid. Choose one of the following:\n"
677
+ "- `small_table`\n"
678
+ "- `game_revenue`\n"
679
+ "- `nycflights`\n"
680
+ "- `global_sales`"
681
+ )
682
+
683
+ if file_type not in ["csv", "parquet", "duckdb"]:
684
+ raise ValueError(
685
+ f"The file type `{file_type}` is not valid. Choose one of the following:\n"
686
+ "- `csv`\n"
687
+ "- `parquet`\n"
688
+ "- `duckdb`"
689
+ )
690
+
691
+ if file_type == "csv":
692
+ # Return path to CSV file inside the zip
693
+ data_path = files("pointblank.data") / f"{dataset}.zip"
694
+
695
+ # For CSV files, we need to extract from zip to a temporary location
696
+ # since most libraries expect actual file paths, not zip contents
697
+ with tempfile.NamedTemporaryFile(mode="wb", suffix=".csv", delete=False) as tmp_file:
698
+ with ZipFile(data_path) as zip_file:
699
+ csv_content = zip_file.read(f"{dataset}.csv")
700
+ tmp_file.write(csv_content)
701
+ return tmp_file.name
702
+
703
+ elif file_type == "parquet":
704
+ # Create a temporary parquet file from the CSV data
705
+ data_path = files("pointblank.data") / f"{dataset}.zip"
706
+
707
+ # We'll need to convert CSV to Parquet temporarily
708
+ with tempfile.NamedTemporaryFile(mode="wb", suffix=".parquet", delete=False) as tmp_file:
709
+ # Load CSV data and save as Parquet
710
+ if _is_lib_present(lib_name="polars"):
711
+ import polars as pl
712
+
713
+ df = pl.read_csv(ZipFile(data_path).read(f"{dataset}.csv"), try_parse_dates=True)
714
+ df.write_parquet(tmp_file.name)
715
+ elif _is_lib_present(lib_name="pandas"):
716
+ import pandas as pd
717
+
718
+ df = pd.read_csv(data_path)
719
+ df.to_parquet(tmp_file.name, index=False)
720
+ else:
721
+ raise ImportError(
722
+ "Either Polars or Pandas is required to create temporary Parquet files."
723
+ )
724
+ return tmp_file.name
725
+
726
+ elif file_type == "duckdb":
727
+ # Return path to DuckDB file
728
+ data_path = files("pointblank.data") / f"{dataset}-duckdb.zip"
729
+
730
+ # Extract DuckDB file to temporary location
731
+ with tempfile.NamedTemporaryFile(mode="wb", suffix=".ddb", delete=False) as tmp_file:
732
+ with ZipFile(data_path) as zip_file:
733
+ ddb_content = zip_file.read(f"{dataset}.ddb")
734
+ tmp_file.write(ddb_content)
735
+ return tmp_file.name
736
+
737
+
738
+ # =============================================================================
739
+ # Utility functions for processing input data (shared by preview() and Validate class)
740
+ # =============================================================================
741
+
742
+
743
+ def _process_connection_string(data: FrameT | Any) -> FrameT | Any:
744
+ """
745
+ Process data parameter to handle database connection strings.
746
+
747
+ Uses the `connect_to_table()` utility function to handle URI-formatted connection strings with
748
+ table specifications. Returns the original data if it's not a connection string.
749
+
750
+ For more details on supported connection string formats, see the documentation
751
+ for `connect_to_table()`.
752
+ """
753
+ # Check if data is a string that looks like a connection string
754
+ if not isinstance(data, str):
755
+ return data
756
+
757
+ # Basic connection string patterns
758
+ connection_patterns = [
759
+ "://", # General URL-like pattern
760
+ ]
761
+
762
+ # Check if it looks like a connection string
763
+ if not any(pattern in data for pattern in connection_patterns):
764
+ return data
765
+
766
+ # Use the utility function to connect to the table
767
+ return connect_to_table(data)
768
+
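
As an editor's illustration of the dispatch rule described in the docstring above (not part of the package diff itself): inputs without a `://` marker pass through unchanged, while URI-style strings are handed to `connect_to_table()`. The file name below is hypothetical.

```python
from pointblank.validate import _process_connection_string

# No "://" present, so this is not treated as a connection string and is
# returned untouched (the CSV handler would pick it up instead).
assert _process_connection_string("data/sales.csv") == "data/sales.csv"

# A URI such as "duckdb:///analytics.ddb::orders" would instead be routed
# through connect_to_table(); shown only as a comment because it needs a
# real database file and an Ibis backend to succeed.
```
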
769
+
770
+ def _process_csv_input(data: FrameT | Any) -> FrameT | Any:
771
+ """
772
+ Process data parameter to handle CSV file inputs.
773
+
774
+ If data is a string or Path with .csv extension, reads the CSV file
775
+ using available libraries (Polars preferred, then Pandas).
776
+
777
+ Returns the original data if it's not a CSV file path.
778
+ """
779
+ from pathlib import Path
780
+
781
+ # Check if data is a string or Path-like object with .csv extension
782
+ csv_path = None
783
+
784
+ if isinstance(data, (str, Path)):
785
+ path_obj = Path(data)
786
+ if path_obj.suffix.lower() == ".csv":
787
+ csv_path = path_obj
788
+
789
+ # If it's not a CSV file path, return the original data
790
+ if csv_path is None:
791
+ return data
792
+
793
+ # Check if the CSV file exists
794
+ if not csv_path.exists():
795
+ raise FileNotFoundError(f"CSV file not found: {csv_path}")
796
+
797
+ # Determine which library to use for reading CSV
798
+ # Prefer Polars, fallback to Pandas
799
+ if _is_lib_present(lib_name="polars"):
800
+ try:
801
+ import polars as pl
802
+
803
+ return pl.read_csv(csv_path, try_parse_dates=True)
804
+ except Exception as e:
805
+ # If Polars fails, try Pandas if available
806
+ if _is_lib_present(lib_name="pandas"):
807
+ import pandas as pd
808
+
809
+ return pd.read_csv(csv_path)
810
+ else:
811
+ raise RuntimeError(
812
+ f"Failed to read CSV file with Polars: {e}. "
813
+ "Pandas is not available as fallback."
814
+ ) from e
815
+ elif _is_lib_present(lib_name="pandas"):
816
+ try:
817
+ import pandas as pd
818
+
819
+ return pd.read_csv(csv_path)
820
+ except Exception as e:
821
+ raise RuntimeError(f"Failed to read CSV file with Pandas: {e}") from e
822
+ else:
823
+ raise ImportError(
824
+ "Neither Polars nor Pandas is available for reading CSV files. "
825
+ "Please install either 'polars' or 'pandas' to use CSV file inputs."
826
+ )
827
+
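
A minimal sketch of the behaviour described above, assuming Polars or Pandas is installed; the temporary CSV file is created only for illustration.

```python
import pathlib
import tempfile

from pointblank.validate import _process_csv_input

# Write a tiny CSV to a temporary location.
csv_path = pathlib.Path(tempfile.mkdtemp()) / "mini.csv"
csv_path.write_text("a,b\n1,2\n3,4\n")

df = _process_csv_input(csv_path)      # loaded as a Polars (or Pandas) DataFrame
same = _process_csv_input([1, 2, 3])   # non-CSV input is returned unchanged
```
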
828
+
829
+ def _process_parquet_input(data: FrameT | Any) -> FrameT | Any:
830
+ """
831
+ Process data parameter to handle Parquet file inputs.
832
+
833
+ Supports:
834
+ - single .parquet file (string or Path)
835
+ - glob patterns for multiple .parquet files (e.g., "data/*.parquet")
836
+ - directory containing .parquet files
837
+ - partitioned Parquet datasets with automatic partition column inference
838
+ - list/sequence of .parquet file paths
839
+
840
+ Returns the original data if it's not a Parquet file input.
841
+ """
842
+ import glob
843
+ from pathlib import Path
844
+
845
+ parquet_paths = []
846
+
847
+ # Handle different input types
848
+ if isinstance(data, (str, Path)):
849
+ data_str = str(data)
850
+ path_obj = Path(data)
851
+
852
+ # Check if it's a glob pattern containing .parquet first; look for glob
853
+ # characters: `*`, `?`, `[`, `]`
854
+ if ".parquet" in data_str.lower() and any(
855
+ char in data_str for char in ["*", "?", "[", "]"]
856
+ ):
857
+ parquet_files = glob.glob(data_str)
858
+ if parquet_files:
859
+ parquet_paths = sorted([Path(f) for f in parquet_files])
860
+ else:
861
+ raise FileNotFoundError(f"No files found matching pattern: {data}")
862
+
863
+ # Check if it's a single .parquet file
864
+ elif path_obj.suffix.lower() == ".parquet":
865
+ if path_obj.exists():
866
+ parquet_paths = [path_obj]
867
+ else:
868
+ raise FileNotFoundError(f"Parquet file not found: {path_obj}")
869
+
870
+ # Check if it's a directory
871
+ elif path_obj.is_dir():
872
+ # First, try to read as a partitioned parquet dataset; This handles datasets where
873
+ # Parquet files are in subdirectories with partition columns encoded in paths
874
+ try:
875
+ # Both Polars and Pandas can handle partitioned datasets natively
876
+ if _is_lib_present(lib_name="polars"):
877
+ import polars as pl
878
+
879
+ # Try reading as partitioned dataset first
880
+ df = pl.read_parquet(str(path_obj))
881
+ return df
882
+ elif _is_lib_present(lib_name="pandas"):
883
+ import pandas as pd
884
+
885
+ # Try reading as partitioned dataset first
886
+ df = pd.read_parquet(str(path_obj))
887
+ return df
888
+ except Exception:
889
+ # If partitioned read fails, fall back to simple directory scan
890
+ pass
891
+
892
+ # Fallback: Look for .parquet files directly in the directory
893
+ parquet_files = list(path_obj.glob("*.parquet"))
894
+ if parquet_files:
895
+ parquet_paths = sorted(parquet_files)
896
+ else:
897
+ raise FileNotFoundError(
898
+ f"No .parquet files found in directory: {path_obj}. "
899
+ f"This could be a non-partitioned directory without .parquet files, "
900
+ f"or a partitioned dataset that couldn't be read."
901
+ )
902
+
903
+ # If it's not a parquet file, directory, or glob pattern, return original data
904
+ else:
905
+ return data
906
+
907
+ # Handle list/sequence of paths
908
+ elif isinstance(data, (list, tuple)):
909
+ for item in data:
910
+ item_path = Path(item)
911
+ if item_path.suffix.lower() == ".parquet":
912
+ if item_path.exists():
913
+ parquet_paths.append(item_path)
914
+ else:
915
+ raise FileNotFoundError(f"Parquet file not found: {item_path}")
916
+ else:
917
+ # If any item is not a parquet file, return original data
918
+ return data
919
+
920
+ # If no parquet files found, return original data
921
+ if not parquet_paths:
922
+ return data
923
+
924
+ # Read the parquet file(s) using available libraries; prefer Polars, fallback to Pandas
925
+ if _is_lib_present(lib_name="polars"):
926
+ try:
927
+ import polars as pl
928
+
929
+ if len(parquet_paths) == 1:
930
+ # Single file
931
+ return pl.read_parquet(parquet_paths[0])
932
+ else:
933
+ # Multiple files: concatenate them
934
+ dfs = [pl.read_parquet(path) for path in parquet_paths]
935
+ return pl.concat(dfs, how="vertical_relaxed")
936
+ except Exception as e:
937
+ # If Polars fails, try Pandas if available
938
+ if _is_lib_present(lib_name="pandas"):
939
+ import pandas as pd
940
+
941
+ if len(parquet_paths) == 1:
942
+ return pd.read_parquet(parquet_paths[0])
943
+ else:
944
+ # Multiple files: concatenate them
945
+ dfs = [pd.read_parquet(path) for path in parquet_paths]
946
+ return pd.concat(dfs, ignore_index=True)
947
+ else:
948
+ raise RuntimeError(
949
+ f"Failed to read Parquet file(s) with Polars: {e}. "
950
+ "Pandas is not available as fallback."
951
+ ) from e
952
+ elif _is_lib_present(lib_name="pandas"):
953
+ try:
954
+ import pandas as pd
955
+
956
+ if len(parquet_paths) == 1:
957
+ return pd.read_parquet(parquet_paths[0])
958
+ else:
959
+ # Multiple files: concatenate them
960
+ dfs = [pd.read_parquet(path) for path in parquet_paths]
961
+ return pd.concat(dfs, ignore_index=True)
962
+ except Exception as e:
963
+ raise RuntimeError(f"Failed to read Parquet file(s) with Pandas: {e}") from e
964
+ else:
965
+ raise ImportError(
966
+ "Neither Polars nor Pandas is available for reading Parquet files. "
967
+ "Please install either 'polars' or 'pandas' to use Parquet file inputs."
968
+ )
969
+
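
To summarize the input shapes this helper accepts, here is a sketch; the paths are hypothetical and would need to exist on disk for the commented calls to succeed.

```python
from pointblank.validate import _process_parquet_input

# _process_parquet_input("events.parquet")             # single file
# _process_parquet_input("data/part-*.parquet")        # glob pattern
# _process_parquet_input("warehouse/sales/")           # directory or partitioned dataset
# _process_parquet_input(["a.parquet", "b.parquet"])   # list of files, concatenated

# Anything that is not a Parquet path is returned unchanged:
unchanged = _process_parquet_input({"not": "a parquet path"})
```
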
970
+
563
971
  def preview(
564
972
  data: FrameT | Any,
565
973
  columns_subset: str | list[str] | Column | None = None,
@@ -590,8 +998,14 @@ def preview(
590
998
  Parameters
591
999
  ----------
592
1000
  data
593
- The table to preview, which could be a DataFrame object or an Ibis table object. Read the
594
- *Supported Input Table Types* section for details on the supported table types.
1001
+ The table to preview, which could be a DataFrame object, an Ibis table object, a CSV
1002
+ file path, a Parquet file path, or a database connection string. When providing a CSV or
1003
+ Parquet file path (as a string or `pathlib.Path` object), the file will be automatically
1004
+ loaded using an available DataFrame library (Polars or Pandas). Parquet input also supports
1005
+ glob patterns, directories containing .parquet files, and Spark-style partitioned datasets.
1006
+ Connection strings enable direct database access via Ibis, with the target table specified
+ using the `::table_name` suffix. Read the *Supported Input Table Types* section for details
1008
+ on the supported table types.
595
1009
  columns_subset
596
1010
  The columns to display in the table, by default `None` (all columns are shown). This can
597
1011
  be a string, a list of strings, a `Column` object, or a `ColumnSelector` object. The latter
@@ -642,12 +1056,34 @@ def preview(
642
1056
  - PySpark table (`"pyspark"`)*
643
1057
  - BigQuery table (`"bigquery"`)*
644
1058
  - Parquet table (`"parquet"`)*
1059
+ - CSV files (string path or `pathlib.Path` object with `.csv` extension)
1060
+ - Parquet files (string path, `pathlib.Path` object, glob pattern, directory containing
+ `.parquet` files, or partitioned dataset)
+ - Database connection strings (URI format with a required `::table_name` suffix)
645
1063
 
646
1064
  The table types marked with an asterisk need to be prepared as Ibis tables (with type of
647
1065
  `ibis.expr.types.relations.Table`). Furthermore, using `preview()` with these types of tables
648
1066
  requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a Polars or
649
1067
  Pandas DataFrame, the availability of Ibis is not needed.
650
1068
 
1069
+ To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
1070
+ provided. The file will be automatically detected and loaded using the best available DataFrame
1071
+ library. The loading preference is Polars first, then Pandas as a fallback.
1072
+
1073
+ Connection strings follow database URL formats and must also specify a table using the
1074
+ `::table_name` suffix. Examples include:
1075
+
1076
+ ```
1077
+ "duckdb:///path/to/database.ddb::table_name"
1078
+ "sqlite:///path/to/database.db::table_name"
1079
+ "postgresql://user:password@localhost:5432/database::table_name"
1080
+ "mysql://user:password@localhost:3306/database::table_name"
1081
+ "bigquery://project/dataset::table_name"
1082
+ "snowflake://user:password@account/database/schema::table_name"
1083
+ ```
1084
+
1085
+ When using connection strings, the Ibis library with the appropriate backend driver is required.
1086
+
651
1087
  Examples
652
1088
  --------
653
1089
  It's easy to preview a table using the `preview()` function. Here's an example using the
@@ -714,8 +1150,80 @@ def preview(
714
1150
  columns_subset=pb.col(pb.starts_with("item") | pb.matches("player"))
715
1151
  )
716
1152
  ```
1153
+
1154
+ ### Working with CSV Files
1155
+
1156
+ The `preview()` function can directly accept CSV file paths, making it easy to preview data
1157
+ stored in CSV files without manual loading:
1158
+
1159
+ ```{python}
1160
+ # Get a path to a CSV file from the package data
1161
+ csv_path = pb.get_data_path("global_sales", "csv")
1162
+
1163
+ pb.preview(csv_path)
1164
+ ```
1165
+
1166
+ You can also use a Path object to specify the CSV file:
1167
+
1168
+ ```{python}
1169
+ from pathlib import Path
1170
+
1171
+ csv_file = Path(pb.get_data_path("game_revenue", "csv"))
1172
+
1173
+ pb.preview(csv_file, n_head=3, n_tail=3)
1174
+ ```
1175
+
1176
+ ### Working with Parquet Files
1177
+
1178
+ The `preview()` function can directly accept Parquet files and datasets in various formats:
1179
+
1180
+ ```{python}
1181
+ # Single Parquet file from package data
1182
+ parquet_path = pb.get_data_path("nycflights", "parquet")
1183
+
1184
+ pb.preview(parquet_path)
1185
+ ```
1186
+
1187
+ You can also use glob patterns and directories:
1188
+
1189
+ ```python
1190
+ # Multiple Parquet files with glob patterns
1191
+ pb.preview("data/sales_*.parquet")
1192
+
1193
+ # Directory containing Parquet files
1194
+ pb.preview("parquet_data/")
1195
+
1196
+ # Partitioned Parquet dataset
1197
+ pb.preview("sales_data/") # Auto-discovers partition columns
1198
+ ```
1199
+
1200
+ ### Working with Database Connection Strings
1201
+
1202
+ The `preview()` function supports database connection strings for direct preview of database
1203
+ tables. Connection strings must specify a table using the `::table_name` suffix:
1204
+
1205
+ ```{python}
1206
+ # Get path to a DuckDB database file from package data
1207
+ duckdb_path = pb.get_data_path("game_revenue", "duckdb")
1208
+
1209
+ pb.preview(f"duckdb:///{duckdb_path}::game_revenue")
1210
+ ```
1211
+
1212
+ For comprehensive documentation on supported connection string formats, error handling, and
1213
+ installation requirements, see the [`connect_to_table()`](`pointblank.connect_to_table`)
1214
+ function.
717
1215
  """
718
1216
 
1217
+ # Process input data to handle different data source types
1218
+ # Handle connection string input (e.g., "duckdb:///path/to/file.ddb::table_name")
1219
+ data = _process_connection_string(data)
1220
+
1221
+ # Handle CSV file input (e.g., "data.csv" or Path("data.csv"))
1222
+ data = _process_csv_input(data)
1223
+
1224
+ # Handle Parquet file input (e.g., "data.parquet", "data/*.parquet", "data/")
1225
+ data = _process_parquet_input(data)
1226
+
719
1227
  if incl_header is None:
720
1228
  incl_header = global_config.preview_incl_header
721
1229
 
@@ -913,7 +1421,7 @@ def _generate_display_table(
913
1421
  k: v.split("(")[0] if "(" in v else v for k, v in col_dtype_dict.items()
914
1422
  }
915
1423
 
916
- # Create a dictionary of column and row positions where the value is None/NA/NULL
1424
+ # Create a dictionary of column and row positions where the value is None/NA/Null
917
1425
  # This is used to highlight these values in the table
918
1426
  if df_lib_name_gt == "polars":
919
1427
  none_values = {k: data[k].is_null().to_list() for k in col_names}
@@ -937,7 +1445,10 @@ def _generate_display_table(
937
1445
  column_values = gt.gt._get_column_of_values(built_gt, column_name=column, context="html")
938
1446
 
939
1447
  # Get the maximum number of characters in the column
940
- max_length_col_vals.append(max([len(str(val)) for val in column_values]))
1448
+ if column_values: # Check if column_values is not empty
1449
+ max_length_col_vals.append(max([len(str(val)) for val in column_values]))
1450
+ else:
1451
+ max_length_col_vals.append(0) # Use 0 for empty columns
941
1452
 
942
1453
  length_col_names = [len(column) for column in col_dtype_dict.keys()]
943
1454
  length_data_types = [len(dtype) for dtype in col_dtype_dict_short.values()]
@@ -1008,8 +1519,12 @@ def _generate_display_table(
1008
1519
 
1009
1520
  # Get the highest number in the `row_number_list` and calculate a width that will
1010
1521
  # safely fit a number of that magnitude
1011
- max_row_num = max(row_number_list)
1012
- max_row_num_width = len(str(max_row_num)) * 7.8 + 10
1522
+ if row_number_list: # Check if list is not empty
1523
+ max_row_num = max(row_number_list)
1524
+ max_row_num_width = len(str(max_row_num)) * 7.8 + 10
1525
+ else:
1526
+ # If row_number_list is empty, use a default width
1527
+ max_row_num_width = 7.8 * 2 + 10 # Width for 2-digit numbers
1013
1528
 
1014
1529
  # Update the col_width_dict to include the row number column
1015
1530
  col_width_dict = {"_row_num_": f"{max_row_num_width}px"} | col_width_dict
@@ -1722,6 +2237,9 @@ def get_column_count(data: FrameT | Any) -> int:
1722
2237
  elif "pandas" in str(type(data)):
1723
2238
  return data.shape[1]
1724
2239
 
2240
+ elif "narwhals" in str(type(data)):
2241
+ return len(data.columns)
2242
+
1725
2243
  else:
1726
2244
  raise ValueError("The input table type supplied in `data=` is not supported.")
1727
2245
 
@@ -1815,6 +2333,9 @@ def get_row_count(data: FrameT | Any) -> int:
1815
2333
  elif "pandas" in str(type(data)):
1816
2334
  return data.shape[0]
1817
2335
 
2336
+ elif "narwhals" in str(type(data)):
2337
+ return data.shape[0]
2338
+
1818
2339
  else:
1819
2340
  raise ValueError("The input table type supplied in `data=` is not supported.")
1820
2341
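
A short sketch of what the two new `narwhals` branches enable: both helpers now accept a Narwhals-wrapped frame directly. It assumes Polars is installed to back the example table; the expected counts come from the `small_table` dataset description earlier in this file.

```python
import narwhals as nw
import pointblank as pb

# Wrap a native Polars frame in a Narwhals DataFrame and pass it directly.
nw_tbl = nw.from_native(pb.load_dataset("small_table", tbl_type="polars"))

pb.get_column_count(nw_tbl)   # 8
pb.get_row_count(nw_tbl)      # 13
```
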
 
@@ -1930,6 +2451,239 @@ class _ValidationInfo:
1930
2451
  return self.val_info
1931
2452
 
1932
2453
 
2454
+ def connect_to_table(connection_string: str) -> Any:
2455
+ """
2456
+ Connect to a database table using a connection string.
2457
+
2458
+ This utility function tests whether a connection string leads to a valid table and returns
2459
+ the table object if successful. It provides helpful error messages when no table is specified
2460
+ or when backend dependencies are missing.
2461
+
2462
+ Parameters
2463
+ ----------
2464
+ connection_string
2465
+ A database connection string with a required table specification using the `::table_name`
2466
+ suffix. Supported formats are outlined in the *Supported Connection String Formats* section.
2467
+
2468
+ Returns
2469
+ -------
2470
+ Any
2471
+ An Ibis table object for the specified database table.
2472
+
2473
+ Supported Connection String Formats
2474
+ -----------------------------------
2475
+ The `connection_string` parameter must include a valid connection string with a table name
2476
+ specified using the `::` syntax. Here are some examples on how to format connection strings
2477
+ for various backends:
2478
+
2479
+ ```
2480
+ DuckDB: "duckdb:///path/to/database.ddb::table_name"
2481
+ SQLite: "sqlite:///path/to/database.db::table_name"
2482
+ PostgreSQL: "postgresql://user:password@localhost:5432/database::table_name"
2483
+ MySQL: "mysql://user:password@localhost:3306/database::table_name"
2484
+ BigQuery: "bigquery://project/dataset::table_name"
2485
+ Snowflake: "snowflake://user:password@account/database/schema::table_name"
2486
+ ```
2487
+
2488
+ If the connection string does not include a table name, the function will attempt to connect to
2489
+ the database and list available tables, providing guidance on how to specify a table.
2490
+
2491
+ Examples
2492
+ --------
2493
+ Connect to a DuckDB table:
2494
+
2495
+ ```{python}
2496
+ import pointblank as pb
2497
+
2498
+ # Get path to a DuckDB database file from package data
2499
+ duckdb_path = pb.get_data_path("game_revenue", "duckdb")
2500
+
2501
+ # Connect to the `game_revenue` table in the DuckDB database
2502
+ game_revenue = pb.connect_to_table(f"duckdb:///{duckdb_path}::game_revenue")
2503
+
2504
+ # Use with the `preview()` function
2505
+ pb.preview(game_revenue)
2506
+ ```
2507
+
2508
+ Here are some backend-specific connection examples:
2509
+
2510
+ ```python
2511
+ # PostgreSQL
2512
+ pg_table = pb.connect_to_table(
2513
+ "postgresql://user:password@localhost:5432/warehouse::customer_data"
2514
+ )
2515
+
2516
+ # SQLite
2517
+ sqlite_table = pb.connect_to_table("sqlite:///local_data.db::products")
2518
+
2519
+ # BigQuery
2520
+ bq_table = pb.connect_to_table("bigquery://my-project/analytics::daily_metrics")
2521
+ ```
2522
+
2523
+ This function requires the Ibis library with appropriate backend drivers:
2524
+
2525
+ ```bash
2526
+ # You can install a set of common backends:
2527
+ pip install 'ibis-framework[duckdb,postgres,mysql,sqlite]'
2528
+
2529
+ # ...or specific backends as needed:
2530
+ pip install 'ibis-framework[duckdb]' # for DuckDB
2531
+ pip install 'ibis-framework[postgres]' # for PostgreSQL
2532
+ ```
2533
+ """
2534
+ # Check if Ibis is available
2535
+ if not _is_lib_present(lib_name="ibis"):
2536
+ raise ImportError(
2537
+ "The Ibis library is not installed but is required for database connection strings.\n"
2538
+ "Install it with: pip install 'ibis-framework[duckdb]' (or other backend as needed)"
2539
+ )
2540
+
2541
+ import ibis
2542
+
2543
+ # Check if connection string includes table specification
2544
+ if "::" not in connection_string:
2545
+ # Try to connect to get available tables for helpful error message
2546
+ try:
2547
+ # Extract the base connection string (without table name)
2548
+ base_connection = connection_string
2549
+
2550
+ # Connect to the database
2551
+ conn = ibis.connect(base_connection)
2552
+
2553
+ # Get list of available tables
2554
+ try:
2555
+ available_tables = conn.list_tables()
2556
+ except Exception:
2557
+ available_tables = []
2558
+
2559
+ conn.disconnect()
2560
+
2561
+ # Create helpful error message
2562
+ if available_tables:
2563
+ table_list = "\n".join(f" - {table}" for table in available_tables)
2564
+ error_msg = (
2565
+ f"No table specified in connection string: {connection_string}\n\n"
2566
+ f"Available tables in the database:\n{table_list}\n\n"
2567
+ f"To access a specific table, use the format:\n"
2568
+ f" {connection_string}::TABLE_NAME\n\n"
2569
+ f"Examples:\n"
2570
+ )
2571
+ # Add examples with first few table names
2572
+ for table in available_tables[:3]:
2573
+ error_msg += f" {connection_string}::{table}\n"
2574
+ else:
2575
+ error_msg = (
2576
+ f"No table specified in connection string: {connection_string}\n\n"
2577
+ f"No tables found in the database or unable to list tables.\n\n"
2578
+ f"To access a specific table, use the format:\n"
2579
+ f" {connection_string}::TABLE_NAME"
2580
+ )
2581
+
2582
+ raise ValueError(error_msg)
2583
+
2584
+ except Exception as e:
2585
+ if isinstance(e, ValueError):
2586
+ raise # Re-raise our custom ValueError
2587
+
2588
+ # Check for backend-specific errors and provide installation guidance
2589
+ error_str = str(e).lower()
2590
+ backend_install_map = {
2591
+ "duckdb": "pip install 'ibis-framework[duckdb]'",
2592
+ "postgresql": "pip install 'ibis-framework[postgres]'",
2593
+ "postgres": "pip install 'ibis-framework[postgres]'",
2594
+ "mysql": "pip install 'ibis-framework[mysql]'",
2595
+ "sqlite": "pip install 'ibis-framework[sqlite]'",
2596
+ "bigquery": "pip install 'ibis-framework[bigquery]'",
2597
+ "snowflake": "pip install 'ibis-framework[snowflake]'",
2598
+ }
2599
+
2600
+ # Check if this is a missing backend dependency
2601
+ for backend, install_cmd in backend_install_map.items():
2602
+ if backend in error_str and ("not found" in error_str or "no module" in error_str):
2603
+ raise ConnectionError(
2604
+ f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
2605
+ f" {install_cmd}\n\n"
2606
+ f"Original error: {e}\n\n"
2607
+ f"Supported connection string formats:\n"
2608
+ f"- DuckDB: 'duckdb:///path/to/file.ddb::table_name'\n"
2609
+ f"- SQLite: 'sqlite:///path/to/file.db::table_name'\n"
2610
+ f"- PostgreSQL: 'postgresql://user:pass@host:port/db::table_name'\n"
2611
+ f"- MySQL: 'mysql://user:pass@host:port/db::table_name'\n"
2612
+ f"- BigQuery: 'bigquery://project/dataset::table_name'\n"
2613
+ f"- Snowflake: 'snowflake://user:pass@account/db/schema::table_name'\n"
2614
+ f"\nNote: Use '::table_name' to specify the table within the database."
2615
+ ) from e
2616
+
2617
+ # Generic connection error
2618
+ raise ConnectionError(
2619
+ f"Failed to connect to database using connection string: {connection_string}\n"
2620
+ f"Error: {e}\n\n"
2621
+ f"No table specified. Use the format: {connection_string}::TABLE_NAME"
2622
+ ) from e
2623
+
2624
+ # Split connection string and table name
2625
+ try:
2626
+ base_connection, table_name = connection_string.rsplit("::", 1)
2627
+ except ValueError:
2628
+ raise ValueError(f"Invalid connection string format: {connection_string}")
2629
+
2630
+ # Connect to database and get table
2631
+ try:
2632
+ conn = ibis.connect(base_connection)
2633
+ table = conn.table(table_name)
2634
+ return table
2635
+
2636
+ except Exception as e:
2637
+ # Check for backend-specific errors and provide installation guidance
2638
+ error_str = str(e).lower()
2639
+ backend_install_map = {
2640
+ "duckdb": "pip install 'ibis-framework[duckdb]'",
2641
+ "postgresql": "pip install 'ibis-framework[postgres]'",
2642
+ "postgres": "pip install 'ibis-framework[postgres]'",
2643
+ "mysql": "pip install 'ibis-framework[mysql]'",
2644
+ "sqlite": "pip install 'ibis-framework[sqlite]'",
2645
+ "bigquery": "pip install 'ibis-framework[bigquery]'",
2646
+ "snowflake": "pip install 'ibis-framework[snowflake]'",
2647
+ }
2648
+
2649
+ # Check if this is a missing backend dependency
2650
+ for backend, install_cmd in backend_install_map.items():
2651
+ if backend in error_str and ("not found" in error_str or "no module" in error_str):
2652
+ raise ConnectionError(
2653
+ f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
2654
+ f" {install_cmd}\n\n"
2655
+ f"Original error: {e}"
2656
+ ) from e
2657
+
2658
+ # Check if table doesn't exist
2659
+ if "table" in error_str and ("not found" in error_str or "does not exist" in error_str):
2660
+ # Try to get available tables for helpful message
2661
+ try:
2662
+ available_tables = conn.list_tables()
2663
+ if available_tables:
2664
+ table_list = "\n".join(f" - {table}" for table in available_tables)
2665
+ raise ValueError(
2666
+ f"Table '{table_name}' not found in database.\n\n"
2667
+ f"Available tables:\n{table_list}\n\n"
2668
+ f"Check the table name and try again with:\n"
2669
+ f" {base_connection}::CORRECT_TABLE_NAME"
2670
+ ) from e
2671
+ else:
2672
+ raise ValueError(
2673
+ f"Table '{table_name}' not found and no tables available in database."
2674
+ ) from e
2675
+ except Exception:
2676
+ raise ValueError(
2677
+ f"Table '{table_name}' not found in database. "
2678
+ f"Check the table name and connection string."
2679
+ ) from e
2680
+
2681
+ # Generic connection error
2682
+ raise ConnectionError(
2683
+ f"Failed to connect to table '{table_name}' using: {base_connection}\nError: {e}"
2684
+ ) from e
2685
+
2686
+
1933
2687
  @dataclass
1934
2688
  class Validate:
1935
2689
  """
@@ -1962,8 +2716,14 @@ class Validate:
1962
2716
  Parameters
1963
2717
  ----------
1964
2718
  data
1965
- The table to validate, which could be a DataFrame object or an Ibis table object. Read the
1966
- *Supported Input Table Types* section for details on the supported table types.
2719
+ The table to validate, which could be a DataFrame object, an Ibis table object, a CSV
2720
+ file path, a Parquet file path, or a database connection string. When providing a CSV or
2721
+ Parquet file path (as a string or `pathlib.Path` object), the file will be automatically
2722
+ loaded using an available DataFrame library (Polars or Pandas). Parquet input also supports
2723
+ glob patterns, directories containing .parquet files, and Spark-style partitioned datasets.
2724
+ Connection strings enable direct database access via Ibis, with the target table specified
+ using the `::table_name` suffix. Read the *Supported Input Table Types* section for details
2726
+ on the supported table types.
1967
2727
  tbl_name
1968
2728
  An optional name to assign to the input table object. If no value is provided, a name will
1969
2729
  be generated based on whatever information is available. This table name will be displayed
@@ -2033,12 +2793,34 @@ class Validate:
2033
2793
  - PySpark table (`"pyspark"`)*
2034
2794
  - BigQuery table (`"bigquery"`)*
2035
2795
  - Parquet table (`"parquet"`)*
2796
+ - CSV files (string path or `pathlib.Path` object with `.csv` extension)
2797
+ - Parquet files (string path, `pathlib.Path` object, glob pattern, directory containing
+ `.parquet` files, or partitioned dataset)
+ - Database connection strings (URI format with a required `::table_name` suffix)
2036
2800
 
2037
2801
  The table types marked with an asterisk need to be prepared as Ibis tables (with type of
2038
2802
  `ibis.expr.types.relations.Table`). Furthermore, the use of `Validate` with such tables requires
2039
2803
  the Ibis library v9.5.0 and above to be installed. If the input table is a Polars or Pandas
2040
2804
  DataFrame, the Ibis library is not required.
2041
2805
 
2806
+ To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
2807
+ provided. The file will be automatically detected and loaded using the best available DataFrame
2808
+ library. The loading preference is Polars first, then Pandas as a fallback.
2809
+
2810
+ Connection strings follow database URL formats and must also specify a table using the
2811
+ `::table_name` suffix. Examples include:
2812
+
2813
+ ```
2814
+ "duckdb:///path/to/database.ddb::table_name"
2815
+ "sqlite:///path/to/database.db::table_name"
2816
+ "postgresql://user:password@localhost:5432/database::table_name"
2817
+ "mysql://user:password@localhost:3306/database::table_name"
2818
+ "bigquery://project/dataset::table_name"
2819
+ "snowflake://user:password@account/database/schema::table_name"
2820
+ ```
2821
+
2822
+ When using connection strings, the Ibis library with the appropriate backend driver is required.
2823
+
2042
2824
  Thresholds
2043
2825
  ----------
2044
2826
  The `thresholds=` parameter is used to set the failure-condition levels for all validation
@@ -2195,8 +2977,8 @@ class Validate:
2195
2977
  ```{python}
2196
2978
  import pointblank as pb
2197
2979
 
2198
- # Load the small_table dataset
2199
- small_table = pb.load_dataset()
2980
+ # Load the `small_table` dataset
2981
+ small_table = pb.load_dataset(dataset="small_table", tbl_type="polars")
2200
2982
 
2201
2983
  # Preview the table
2202
2984
  pb.preview(small_table)
@@ -2262,7 +3044,7 @@ class Validate:
2262
3044
  brief). Here's an example of a global setting for briefs:
2263
3045
 
2264
3046
  ```{python}
2265
- validation = (
3047
+ validation_2 = (
2266
3048
  pb.Validate(
2267
3049
  data=pb.load_dataset(),
2268
3050
  tbl_name="small_table",
@@ -2279,7 +3061,7 @@ class Validate:
2279
3061
  .interrogate()
2280
3062
  )
2281
3063
 
2282
- validation
3064
+ validation_2
2283
3065
  ```
2284
3066
 
2285
3067
  We see the text of the briefs appear in the `STEP` column of the reporting table. Furthermore,
@@ -2297,7 +3079,7 @@ class Validate:
2297
3079
  the data extracts for each validation step.
2298
3080
 
2299
3081
  ```{python}
2300
- validation.get_data_extracts()
3082
+ validation_2.get_data_extracts()
2301
3083
  ```
2302
3084
 
2303
3085
  We can also view step reports for each validation step using the
@@ -2305,7 +3087,7 @@ class Validate:
2305
3087
  type of validation step and shows the relevant information for a step's validation.
2306
3088
 
2307
3089
  ```{python}
2308
- validation.get_step_report(i=2)
3090
+ validation_2.get_step_report(i=2)
2309
3091
  ```
2310
3092
 
2311
3093
  The `Validate` class also has a method for getting the sundered data, which is the data that
@@ -2313,11 +3095,141 @@ class Validate:
2313
3095
  [`get_sundered_data()`](`pointblank.Validate.get_sundered_data`) method.
2314
3096
 
2315
3097
  ```{python}
2316
- pb.preview(validation.get_sundered_data())
3098
+ pb.preview(validation_2.get_sundered_data())
2317
3099
  ```
2318
3100
 
2319
3101
  The sundered data is a DataFrame that contains the rows that passed or failed the validation.
2320
3102
  The default behavior is to return the rows that failed the validation, as shown above.
3103
+
3104
+ ### Working with CSV Files
3105
+
3106
+ The `Validate` class can directly accept CSV file paths, making it easy to validate data stored
3107
+ in CSV files without manual loading:
3108
+
3109
+ ```{python}
3110
+ # Get a path to a CSV file from the package data
3111
+ csv_path = pb.get_data_path("global_sales", "csv")
3112
+
3113
+ validation_3 = (
3114
+ pb.Validate(
3115
+ data=csv_path,
3116
+ label="CSV validation example"
3117
+ )
3118
+ .col_exists(["customer_id", "product_id", "revenue"])
3119
+ .col_vals_not_null(["customer_id", "product_id"])
3120
+ .col_vals_gt(columns="revenue", value=0)
3121
+ .interrogate()
3122
+ )
3123
+
3124
+ validation_3
3125
+ ```
3126
+
3127
+ You can also use a Path object to specify the CSV file. Here's an example of how to do that:
3128
+
3129
+ ```{python}
3130
+ from pathlib import Path
3131
+
3132
+ csv_file = Path(pb.get_data_path("game_revenue", "csv"))
3133
+
3134
+ validation_4 = (
3135
+ pb.Validate(data=csv_file, label="Game Revenue Validation")
3136
+ .col_exists(["player_id", "session_id", "item_name"])
3137
+ .col_vals_regex(
3138
+ columns="session_id",
3139
+ pattern=r"[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}"
3140
+ )
3141
+ .col_vals_gt(columns="item_revenue", value=0, na_pass=True)
3142
+ .interrogate()
3143
+ )
3144
+
3145
+ validation_4
3146
+ ```
3147
+
3148
+ The CSV loading is automatic, so when a string or Path with a `.csv` extension is provided,
3149
+ Pointblank will automatically load the file using the best available DataFrame library (Polars
3150
+ preferred, Pandas as fallback). The loaded data can then be used with all validation methods
3151
+ just like any other supported table type.
3152
+
3153
+ ### Working with Parquet Files
3154
+
3155
+ The `Validate` class can directly accept Parquet files and datasets in various formats. The
3156
+ following examples illustrate how to validate Parquet files:
3157
+
3158
+ ```{python}
3159
+ # Single Parquet file from package data
3160
+ parquet_path = pb.get_data_path("nycflights", "parquet")
3161
+
3162
+ validation_5 = (
3163
+ pb.Validate(
3164
+ data=parquet_path,
3165
+ tbl_name="NYC Flights Data"
3166
+ )
3167
+ .col_vals_not_null(["carrier", "origin", "dest"])
3168
+ .col_vals_gt(columns="distance", value=0)
3169
+ .interrogate()
3170
+ )
3171
+
3172
+ validation_5
3173
+ ```
3174
+
3175
+ You can also use glob patterns and directories. Here are some examples for how to:
3176
+
3177
+ 1. load multiple Parquet files
3178
+ 2. load a Parquet-containing directory
3179
+ 3. load a partitioned Parquet dataset
3180
+
3181
+ ```python
3182
+ # Multiple Parquet files with glob patterns
3183
+ validation_6 = pb.Validate(data="data/sales_*.parquet")
3184
+
3185
+ # Directory containing Parquet files
3186
+ validation_7 = pb.Validate(data="parquet_data/")
3187
+
3188
+ # Partitioned Parquet dataset
3189
+ validation_8 = (
3190
+ pb.Validate(data="sales_data/") # Contains year=2023/quarter=Q1/region=US/sales.parquet
3191
+ .col_exists(["transaction_id", "amount", "year", "quarter", "region"])
3192
+ .interrogate()
3193
+ )
3194
+ ```
3195
+
3196
+ When you point to a directory that contains a partitioned Parquet dataset (with subdirectories
3197
+ like `year=2023/quarter=Q1/region=US/`), Pointblank will automatically:
3198
+
3199
+ - discover all Parquet files recursively
3200
+ - extract partition column values from directory paths
3201
+ - add partition columns to the final DataFrame
3202
+ - combine all partitions into a single table for validation
3203
+
3204
+ Both Polars and Pandas handle partitioned datasets natively, so this works seamlessly with
3205
+ either DataFrame library. The loading preference is Polars first, then Pandas as a fallback.
3206
+
3207
+ ### Working with Database Connection Strings
3208
+
3209
+ The `Validate` class supports database connection strings for direct validation of database
3210
+ tables. Connection strings must specify a table using the `::table_name` suffix:
3211
+
3212
+ ```{python}
3213
+ # Get path to a DuckDB database file from package data
3214
+ duckdb_path = pb.get_data_path("game_revenue", "duckdb")
3215
+
3216
+ validation_9 = (
3217
+ pb.Validate(
3218
+ data=f"duckdb:///{duckdb_path}::game_revenue",
3219
+ label="DuckDB Game Revenue Validation"
3220
+ )
3221
+ .col_exists(["player_id", "session_id", "item_revenue"])
3222
+ .col_vals_gt(columns="item_revenue", value=0)
3223
+ .interrogate()
3224
+ )
3225
+
3226
+ validation_9
3227
+ ```
3228
+
3229
+ For comprehensive documentation on supported connection string formats, error handling, and
3230
+ installation requirements, see the [`connect_to_table()`](`pointblank.connect_to_table`)
3231
+ function. This function handles all the connection logic and provides helpful error messages
3232
+ when table specifications are missing or backend dependencies are not installed.
2321
3233
  """
2322
3234
 
2323
3235
  data: FrameT | Any
@@ -2331,6 +3243,15 @@ class Validate:
2331
3243
  locale: str | None = None
2332
3244
 
2333
3245
  def __post_init__(self):
3246
+ # Handle connection string input for the data parameter
3247
+ self.data = _process_connection_string(self.data)
3248
+
3249
+ # Handle CSV file input for the data parameter
3250
+ self.data = _process_csv_input(self.data)
3251
+
3252
+ # Handle Parquet file input for the data parameter
3253
+ self.data = _process_parquet_input(self.data)
3254
+
2334
3255
  # Check input of the `thresholds=` argument
2335
3256
  _check_thresholds(thresholds=self.thresholds)
2336
3257
 
@@ -2506,12 +3427,16 @@ class Validate:
2506
3427
  (i.e., no validation steps will be created for them).
2507
3428
 
2508
3429
  A list with a combination of column names and tuples can be provided as well. This allows
2509
- for more complex segmentation scenarios. The following inputs are all valid:
3430
+ for more complex segmentation scenarios. The following inputs are both valid:
2510
3431
 
2511
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
2512
- in the `"region"` column and specific dates in the `"date"` column
2513
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
2514
- columns
3432
+ ```
3433
+ # Segments from all unique values in the `region` column
3434
+ # and specific dates in the `date` column
3435
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
3436
+
3437
+ # Segments from all unique values in the `region` and `date` columns
3438
+ segments=["region", "date"]
3439
+ ```
2515
3440
 
2516
3441
  The segmentation is performed during interrogation, and the resulting validation steps will
2517
3442
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -2794,12 +3719,16 @@ class Validate:
2794
3719
  (i.e., no validation steps will be created for them).
2795
3720
 
2796
3721
  A list with a combination of column names and tuples can be provided as well. This allows
2797
- for more complex segmentation scenarios. The following inputs are all valid:
3722
+ for more complex segmentation scenarios. The following inputs are both valid:
2798
3723
 
2799
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
2800
- in the `"region"` column and specific dates in the `"date"` column
2801
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
2802
- columns
3724
+ ```
3725
+ # Segments from all unique values in the `region` column
3726
+ # and specific dates in the `date` column
3727
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
3728
+
3729
+ # Segments from all unique values in the `region` and `date` columns
3730
+ segments=["region", "date"]
3731
+ ```
2803
3732
 
2804
3733
  The segmentation is performed during interrogation, and the resulting validation steps will
2805
3734
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3081,12 +4010,16 @@ class Validate:
3081
4010
  (i.e., no validation steps will be created for them).
3082
4011
 
3083
4012
  A list with a combination of column names and tuples can be provided as well. This allows
3084
- for more complex segmentation scenarios. The following inputs are all valid:
4013
+ for more complex segmentation scenarios. The following inputs are both valid:
3085
4014
 
3086
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
3087
- in the `"region"` column and specific dates in the `"date"` column
3088
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
3089
- columns
4015
+ ```
4016
+ # Segments from all unique values in the `region` column
4017
+ # and specific dates in the `date` column
4018
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
4019
+
4020
+ # Segments from all unique values in the `region` and `date` columns
4021
+ segments=["region", "date"]
4022
+ ```
3090
4023
 
3091
4024
  The segmentation is performed during interrogation, and the resulting validation steps will
3092
4025
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3367,12 +4300,16 @@ class Validate:
3367
4300
  (i.e., no validation steps will be created for them).
3368
4301
 
3369
4302
  A list with a combination of column names and tuples can be provided as well. This allows
3370
- for more complex segmentation scenarios. The following inputs are all valid:
4303
+ for more complex segmentation scenarios. The following inputs are both valid:
3371
4304
 
3372
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
3373
- in the `"region"` column and specific dates in the `"date"` column
3374
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
3375
- columns
4305
+ ```
4306
+ # Segments from all unique values in the `region` column
4307
+ # and specific dates in the `date` column
4308
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
4309
+
4310
+ # Segments from all unique values in the `region` and `date` columns
4311
+ segments=["region", "date"]
4312
+ ```
3376
4313
 
3377
4314
  The segmentation is performed during interrogation, and the resulting validation steps will
3378
4315
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3651,12 +4588,16 @@ class Validate:
3651
4588
  (i.e., no validation steps will be created for them).
3652
4589
 
3653
4590
  A list with a combination of column names and tuples can be provided as well. This allows
3654
- for more complex segmentation scenarios. The following inputs are all valid:
4591
+ for more complex segmentation scenarios. The following inputs are both valid:
3655
4592
 
3656
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
3657
- in the `"region"` column and specific dates in the `"date"` column
3658
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
3659
- columns
4593
+ ```
4594
+ # Segments from all unique values in the `region` column
4595
+ # and specific dates in the `date` column
4596
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
4597
+
4598
+ # Segments from all unique values in the `region` and `date` columns
4599
+ segments=["region", "date"]
4600
+ ```
3660
4601
 
3661
4602
  The segmentation is performed during interrogation, and the resulting validation steps will
3662
4603
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3939,12 +4880,16 @@ class Validate:
3939
4880
  (i.e., no validation steps will be created for them).
3940
4881
 
3941
4882
  A list with a combination of column names and tuples can be provided as well. This allows
3942
- for more complex segmentation scenarios. The following inputs are all valid:
4883
+ for more complex segmentation scenarios. The following inputs are both valid:
3943
4884
 
3944
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
3945
- in the `"region"` column and specific dates in the `"date"` column
3946
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
3947
- columns
4885
+ ```
4886
+ # Segments from all unique values in the `region` column
4887
+ # and specific dates in the `date` column
4888
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
4889
+
4890
+ # Segments from all unique values in the `region` and `date` columns
4891
+ segments=["region", "date"]
4892
+ ```
3948
4893
 
3949
4894
  The segmentation is performed during interrogation, and the resulting validation steps will
3950
4895
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -4241,12 +5186,16 @@ class Validate:
4241
5186
  (i.e., no validation steps will be created for them).
4242
5187
 
4243
5188
  A list with a combination of column names and tuples can be provided as well. This allows
4244
- for more complex segmentation scenarios. The following inputs are all valid:
5189
+ for more complex segmentation scenarios. The following inputs are both valid:
4245
5190
 
4246
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
4247
- in the `"region"` column and specific dates in the `"date"` column
4248
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
4249
- columns
5191
+ ```
5192
+ # Segments from all unique values in the `region` column
5193
+ # and specific dates in the `date` column
5194
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
5195
+
5196
+ # Segments from all unique values in the `region` and `date` columns
5197
+ segments=["region", "date"]
5198
+ ```
4250
5199
 
4251
5200
  The segmentation is performed during interrogation, and the resulting validation steps will
4252
5201
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -4557,12 +5506,16 @@ class Validate:
4557
5506
  (i.e., no validation steps will be created for them).
4558
5507
 
4559
5508
  A list with a combination of column names and tuples can be provided as well. This allows
4560
- for more complex segmentation scenarios. The following inputs are all valid:
5509
+ for more complex segmentation scenarios. The following inputs are both valid:
4561
5510
 
4562
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
4563
- in the `"region"` column and specific dates in the `"date"` column
4564
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
4565
- columns
5511
+ ```
5512
+ # Segments from all unique values in the `region` column
5513
+ # and specific dates in the `date` column
5514
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
5515
+
5516
+ # Segments from all unique values in the `region` and `date` columns
5517
+ segments=["region", "date"]
5518
+ ```
4566
5519
 
4567
5520
  The segmentation is performed during interrogation, and the resulting validation steps will
4568
5521
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -4829,12 +5782,16 @@ class Validate:
4829
5782
  (i.e., no validation steps will be created for them).
4830
5783
 
4831
5784
  A list with a combination of column names and tuples can be provided as well. This allows
4832
- for more complex segmentation scenarios. The following inputs are all valid:
5785
+ for more complex segmentation scenarios. The following inputs are both valid:
4833
5786
 
4834
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
4835
- in the `"region"` column and specific dates in the `"date"` column
4836
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
4837
- columns
5787
+ ```
5788
+ # Segments from all unique values in the `region` column
5789
+ # and specific dates in the `date` column
5790
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
5791
+
5792
+ # Segments from all unique values in the `region` and `date` columns
5793
+ segments=["region", "date"]
5794
+ ```
4838
5795
 
4839
5796
  The segmentation is performed during interrogation, and the resulting validation steps will
4840
5797
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -5082,12 +6039,16 @@ class Validate:
5082
6039
  (i.e., no validation steps will be created for them).
5083
6040
 
5084
6041
  A list with a combination of column names and tuples can be provided as well. This allows
5085
- for more complex segmentation scenarios. The following inputs are all valid:
6042
+ for more complex segmentation scenarios. The following inputs are both valid:
5086
6043
 
5087
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
5088
- in the `"region"` column and specific dates in the `"date"` column
5089
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
5090
- columns
6044
+ ```
6045
+ # Segments from all unique values in the `region` column
6046
+ # and specific dates in the `date` column
6047
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
6048
+
6049
+ # Segments from all unique values in the `region` and `date` columns
6050
+ segments=["region", "date"]
6051
+ ```
5091
6052
 
5092
6053
  The segmentation is performed during interrogation, and the resulting validation steps will
5093
6054
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -5243,9 +6204,9 @@ class Validate:
5243
6204
  active: bool = True,
5244
6205
  ) -> Validate:
5245
6206
  """
5246
- Validate whether values in a column are NULL.
6207
+ Validate whether values in a column are Null.
5247
6208
 
5248
- The `col_vals_null()` validation method checks whether column values in a table are NULL.
6209
+ The `col_vals_null()` validation method checks whether column values in a table are Null.
5249
6210
  This validation will operate over the number of test units that is equal to the number
5250
6211
  of rows in the table.
5251
6212
 
@@ -5326,12 +6287,16 @@ class Validate:
5326
6287
  (i.e., no validation steps will be created for them).
5327
6288
 
5328
6289
  A list with a combination of column names and tuples can be provided as well. This allows
5329
- for more complex segmentation scenarios. The following inputs are all valid:
6290
+ for more complex segmentation scenarios. The following inputs are both valid:
5330
6291
 
5331
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
5332
- in the `"region"` column and specific dates in the `"date"` column
5333
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
5334
- columns
6292
+ ```
6293
+ # Segments from all unique values in the `region` column
6294
+ # and specific dates in the `date` column
6295
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
6296
+
6297
+ # Segments from all unique values in the `region` and `date` columns
6298
+ segments=["region", "date"]
6299
+ ```
5335
6300
 
5336
6301
  The segmentation is performed during interrogation, and the resulting validation steps will
5337
6302
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -5482,10 +6447,10 @@ class Validate:
5482
6447
  active: bool = True,
5483
6448
  ) -> Validate:
5484
6449
  """
5485
- Validate whether values in a column are not NULL.
6450
+ Validate whether values in a column are not Null.
5486
6451
 
5487
6452
  The `col_vals_not_null()` validation method checks whether column values in a table are not
5488
- NULL. This validation will operate over the number of test units that is equal to the number
6453
+ Null. This validation will operate over the number of test units that is equal to the number
5489
6454
  of rows in the table.
5490
6455
 
5491
6456
  Parameters
@@ -5565,12 +6530,16 @@ class Validate:
5565
6530
  (i.e., no validation steps will be created for them).
5566
6531
 
5567
6532
  A list with a combination of column names and tuples can be provided as well. This allows
5568
- for more complex segmentation scenarios. The following inputs are all valid:
6533
+ for more complex segmentation scenarios. The following inputs are both valid:
5569
6534
 
5570
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
5571
- in the `"region"` column and specific dates in the `"date"` column
5572
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
5573
- columns
6535
+ ```
6536
+ # Segments from all unique values in the `region` column
6537
+ # and specific dates in the `date` column
6538
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
6539
+
6540
+ # Segments from all unique values in the `region` and `date` columns
6541
+ segments=["region", "date"]
6542
+ ```
5574
6543
 
5575
6544
  The segmentation is performed during interrogation, and the resulting validation steps will
5576
6545
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -5812,12 +6781,16 @@ class Validate:
5812
6781
  (i.e., no validation steps will be created for them).
5813
6782
 
5814
6783
  A list with a combination of column names and tuples can be provided as well. This allows
5815
- for more complex segmentation scenarios. The following inputs are all valid:
6784
+ for more complex segmentation scenarios. The following inputs are both valid:
5816
6785
 
5817
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
5818
- in the `"region"` column and specific dates in the `"date"` column
5819
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
5820
- columns
6786
+ ```
6787
+ # Segments from all unique values in the `region` column
6788
+ # and specific dates in the `date` column
6789
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
6790
+
6791
+ # Segments from all unique values in the `region` and `date` columns
6792
+ segments=["region", "date"]
6793
+ ```
5821
6794
 
5822
6795
  The segmentation is performed during interrogation, and the resulting validation steps will
5823
6796
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -6055,12 +7028,16 @@ class Validate:
6055
7028
  (i.e., no validation steps will be created for them).
6056
7029
 
6057
7030
  A list with a combination of column names and tuples can be provided as well. This allows
6058
- for more complex segmentation scenarios. The following inputs are all valid:
7031
+ for more complex segmentation scenarios. The following inputs are both valid:
6059
7032
 
6060
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
6061
- in the `"region"` column and specific dates in the `"date"` column
6062
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
6063
- columns
7033
+ ```
7034
+ # Segments from all unique values in the `region` column
7035
+ # and specific dates in the `date` column
7036
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
7037
+
7038
+ # Segments from all unique values in the `region` and `date` columns
7039
+ segments=["region", "date"]
7040
+ ```
6064
7041
 
6065
7042
  The segmentation is performed during interrogation, and the resulting validation steps will
6066
7043
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -6446,12 +7423,16 @@ class Validate:
6446
7423
  (i.e., no validation steps will be created for them).
6447
7424
 
6448
7425
  A list with a combination of column names and tuples can be provided as well. This allows
6449
- for more complex segmentation scenarios. The following inputs are all valid:
7426
+ for more complex segmentation scenarios. The following inputs are both valid:
6450
7427
 
6451
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
6452
- in the `"region"` column and specific dates in the `"date"` column
6453
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
6454
- columns
7428
+ ```
7429
+ # Segments from all unique values in the `region` column
7430
+ # and specific dates in the `date` column
7431
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
7432
+
7433
+ # Segments from all unique values in the `region` and `date` columns
7434
+ segments=["region", "date"]
7435
+ ```
6455
7436
 
6456
7437
  The segmentation is performed during interrogation, and the resulting validation steps will
6457
7438
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -6683,12 +7664,16 @@ class Validate:
6683
7664
  (i.e., no validation steps will be created for them).
6684
7665
 
6685
7666
  A list with a combination of column names and tuples can be provided as well. This allows
6686
- for more complex segmentation scenarios. The following inputs are all valid:
7667
+ for more complex segmentation scenarios. The following inputs are both valid:
6687
7668
 
6688
- - `segments=["region", ("date", ["2023-01-01", "2023-01-02"])]`: segments on unique values
6689
- in the `"region"` column and specific dates in the `"date"` column
6690
- - `segments=["region", "date"]`: segments on unique values in the `"region"` and `"date"`
6691
- columns
7669
+ ```
7670
+ # Segments from all unique values in the `region` column
7671
+ # and specific dates in the `date` column
7672
+ segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
7673
+
7674
+ # Segments from all unique values in the `region` and `date` columns
7675
+ segments=["region", "date"]
7676
+ ```
6692
7677
 
6693
7678
  The segmentation is performed during interrogation, and the resulting validation steps will
6694
7679
  be numbered sequentially. Each segment will have its own validation step, and the results
@@ -8241,37 +9226,47 @@ class Validate:
8241
9226
 
8242
9227
  # Determine whether any preprocessing functions are to be applied to the table
8243
9228
  if validation.pre is not None:
8244
- # Read the text of the preprocessing function
8245
- pre_text = _pre_processing_funcs_to_str(validation.pre)
9229
+ try:
9230
+ # Read the text of the preprocessing function
9231
+ pre_text = _pre_processing_funcs_to_str(validation.pre)
9232
+
9233
+ # Determine if the preprocessing function is a lambda function; return a boolean
9234
+ is_lambda = re.match(r"^lambda", pre_text) is not None
8246
9235
 
8247
- # Determine if the preprocessing function is a lambda function; return a boolean
8248
- is_lambda = re.match(r"^lambda", pre_text) is not None
9236
+ # If the preprocessing function is a lambda function, then check if there is
9237
+ # a keyword argument called `dfn` in the lambda signature; if so, that's a cue
9238
+ # to use a Narwhalified version of the table
9239
+ if is_lambda:
9240
+ # Get the signature of the lambda function
9241
+ sig = inspect.signature(validation.pre)
8249
9242
 
8250
- # If the preprocessing function is a lambda function, then check if there is
8251
- # a keyword argument called `dfn` in the lamda signature; if so, that's a cue
8252
- # to use a Narwhalified version of the table
8253
- if is_lambda:
8254
- # Get the signature of the lambda function
8255
- sig = inspect.signature(validation.pre)
9243
+ # Check if the lambda function has a keyword argument called `dfn`
9244
+ if "dfn" in sig.parameters:
9245
+ # Convert the table to a Narwhals DataFrame
9246
+ data_tbl_step = nw.from_native(data_tbl_step)
8256
9247
 
8257
- # Check if the lambda function has a keyword argument called `dfn`
8258
- if "dfn" in sig.parameters:
8259
- # Convert the table to a Narwhals DataFrame
8260
- data_tbl_step = nw.from_native(data_tbl_step)
9248
+ # Apply the preprocessing function to the table
9249
+ data_tbl_step = validation.pre(dfn=data_tbl_step)
8261
9250
 
8262
- # Apply the preprocessing function to the table
8263
- data_tbl_step = validation.pre(dfn=data_tbl_step)
9251
+ # Convert the table back to its original format
9252
+ data_tbl_step = nw.to_native(data_tbl_step)
8264
9253
 
8265
- # Convert the table back to its original format
8266
- data_tbl_step = nw.to_native(data_tbl_step)
9254
+ else:
9255
+ # Apply the preprocessing function to the table
9256
+ data_tbl_step = validation.pre(data_tbl_step)
8267
9257
 
8268
- else:
8269
- # Apply the preprocessing function to the table
9258
+ # If the preprocessing function is a function, apply it to the table
9259
+ elif isinstance(validation.pre, Callable):
8270
9260
  data_tbl_step = validation.pre(data_tbl_step)
8271
9261
 
8272
- # If the preprocessing function is a function, apply it to the table
8273
- elif isinstance(validation.pre, Callable):
8274
- data_tbl_step = validation.pre(data_tbl_step)
9262
+ except Exception:
9263
+ # If preprocessing fails, mark the validation as having an eval_error
9264
+ validation.eval_error = True
9265
+ end_time = datetime.datetime.now(datetime.timezone.utc)
9266
+ validation.proc_duration_s = (end_time - start_time).total_seconds()
9267
+ validation.time_processed = end_time.isoformat(timespec="milliseconds")
9268
+ validation.active = False
9269
+ continue
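
For reference, a minimal sketch of the two `pre=` styles this logic distinguishes: a lambda whose parameter is named `dfn` receives a Narwhals-wrapped table, while any other callable receives the table in its native form. The Polars table and column names here are illustrative assumptions.

```python
import narwhals as nw
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"a": [1, -2, 3], "b": [10, 20, 30]})

validation = (
    pb.Validate(data=tbl)
    # Plain callable: receives the native (Polars) table
    .col_vals_gt(columns="b", value=5, pre=lambda df: df.filter(pl.col("a") > 0))
    # Lambda with a `dfn` parameter: receives a Narwhals-wrapped table instead
    .col_vals_gt(columns="b", value=5, pre=lambda dfn: dfn.filter(nw.col("a") > 0))
    .interrogate()
)
```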
8275
9270
 
8276
9271
  # ------------------------------------------------
8277
9272
  # Segmentation stage
@@ -8284,12 +9279,28 @@ class Validate:
8284
9279
  data_tbl=data_tbl_step, segments_expr=validation.segments
8285
9280
  )
8286
9281
 
9282
+ # ------------------------------------------------
9283
+ # Determine table type and `collect()` if needed
9284
+ # ------------------------------------------------
9285
+
9286
+ if tbl_type not in IBIS_BACKENDS:
9287
+ tbl_type = "local"
9288
+
9289
+ # If the table is a lazy frame, we need to collect it
9290
+ if _is_lazy_frame(data_tbl_step):
9291
+ data_tbl_step = data_tbl_step.collect()
9292
+
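
On the user side, and assuming lazy inputs are accepted as this handling suggests, a Polars `LazyFrame` can be passed directly and is only materialized at this point of interrogation:

```python
import polars as pl
import pointblank as pb

lazy_tbl = pl.LazyFrame({"x": [1, 2, 3]})

# The LazyFrame is collected to an eager table during interrogation,
# right before the number of test units is counted
validation = (
    pb.Validate(data=lazy_tbl)
    .col_vals_gt(columns="x", value=0)
    .interrogate()
)
```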
9293
+ # ------------------------------------------------
9294
+ # Set the number of test units
9295
+ # ------------------------------------------------
9296
+
8287
9297
  validation.n = NumberOfTestUnits(df=data_tbl_step, column=column).get_test_units(
8288
9298
  tbl_type=tbl_type
8289
9299
  )
8290
9300
 
8291
- if tbl_type not in IBIS_BACKENDS:
8292
- tbl_type = "local"
9301
+ # ------------------------------------------------
9302
+ # Validation stage
9303
+ # ------------------------------------------------
8293
9304
 
8294
9305
  if assertion_category == "COMPARE_ONE":
8295
9306
  results_tbl = ColValsCompareOne(
@@ -8480,36 +9491,32 @@ class Validate:
8480
9491
 
8481
9492
  else:
8482
9493
  # If the result is not a list, then we assume it's a table in the conventional
8483
- # form (where the column is `pb_is_good_` exists, with boolean values)
8484
-
9494
+ # form (where a column named `pb_is_good_` exists, with boolean values)
8485
9495
  results_tbl = results_tbl_list
8486
9496
 
8487
9497
  # If the results table is not `None`, then we assume there is a table with a column
8488
9498
  # called `pb_is_good_` that contains boolean values; we can then use this table to
8489
9499
  # determine the number of test units that passed and failed
8490
9500
  if results_tbl is not None:
8491
- # Extract the `pb_is_good_` column from the table as a results list
8492
- if tbl_type in IBIS_BACKENDS:
8493
- # Select the DataFrame library to use for getting the results list
8494
- df_lib = _select_df_lib(preference="polars")
8495
- df_lib_name = df_lib.__name__
8496
-
8497
- if df_lib_name == "pandas":
8498
- results_list = (
8499
- results_tbl.select("pb_is_good_").to_pandas()["pb_is_good_"].to_list()
8500
- )
8501
- else:
8502
- results_list = (
8503
- results_tbl.select("pb_is_good_").to_polars()["pb_is_good_"].to_list()
8504
- )
9501
+ # Count the number of passing and failing test units
9502
+ validation.n_passed = _count_true_values_in_column(
9503
+ tbl=results_tbl, column="pb_is_good_"
9504
+ )
9505
+ validation.n_failed = _count_true_values_in_column(
9506
+ tbl=results_tbl, column="pb_is_good_", inverse=True
9507
+ )
8505
9508
 
8506
- else:
8507
- results_list = nw.from_native(results_tbl)["pb_is_good_"].to_list()
9509
+ # Solely for the col_vals_in_set assertion type, any Null values in the
9510
+ # `pb_is_good_` column are counted as failing test units
9511
+ if assertion_type == "col_vals_in_set":
9512
+ null_count = _count_null_values_in_column(tbl=results_tbl, column="pb_is_good_")
9513
+ validation.n_failed += null_count
8508
9514
 
8509
- validation.all_passed = all(results_list)
8510
- validation.n = len(results_list)
8511
- validation.n_passed = results_list.count(True)
8512
- validation.n_failed = results_list.count(False)
9515
+ # For column-value validations, the number of test units is the number of rows
9516
+ validation.n = get_row_count(data=results_tbl)
9517
+
9518
+ # Set the `all_passed` attribute based on whether there are any failing test units
9519
+ validation.all_passed = validation.n_failed == 0
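
The bodies of `_count_true_values_in_column()` and `_count_null_values_in_column()` are not shown here; a rough Narwhals-based approximation of the counting they imply over the `pb_is_good_` column (hypothetical helpers, for illustration only) could look like:

```python
import narwhals as nw

def count_true_values(tbl, column: str, inverse: bool = False) -> int:
    # Count True values (or False values when inverse=True) in a boolean column;
    # null entries match neither comparison and so are excluded here
    df = nw.from_native(tbl, eager_only=True)
    target = not inverse
    return df.filter(nw.col(column) == target).shape[0]

def count_null_values(tbl, column: str) -> int:
    # Null entries are tallied separately (e.g., for `col_vals_in_set` above)
    df = nw.from_native(tbl, eager_only=True)
    return df.filter(nw.col(column).is_null()).shape[0]
```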
8513
9520
 
8514
9521
  # Calculate fractions of passing and failing test units
8515
9522
  # - `f_passed` is the fraction of test units that passed
@@ -9818,7 +10825,7 @@ class Validate:
9818
10825
  Get the 'critical' level status for each validation step.
9819
10826
 
9820
10827
  The 'critical' status for a validation step is `True` if the fraction of failing test units
9821
- meets or exceeds the threshold for the notification level. Otherwise, the status is `False`.
10828
+ meets or exceeds the threshold for the 'critical' level. Otherwise, the status is `False`.
9822
10829
 
9823
10830
  The ascribed name of 'critical' is semantic and is thus simply a status indicator that could
9824
10831
  be used to trigger some action to be taken. Here's how it fits in with other status
@@ -9830,14 +10837,14 @@ class Validate:
9830
10837
  severity
9831
10838
  - 'critical': the status obtained by calling `critical()`, most severe
9832
10839
 
9833
- This method provides a dictionary of the notification status for each validation step. If
9834
- the `scalar=True` argument is provided and `i=` is a scalar, the value is returned as a
9835
- scalar instead of a dictionary.
10840
+ This method provides a dictionary of the 'critical' status for each validation step. If the
10841
+ `scalar=True` argument is provided and `i=` is a scalar, the value is returned as a scalar
10842
+ instead of a dictionary.
9836
10843
 
9837
10844
  Parameters
9838
10845
  ----------
9839
10846
  i
9840
- The validation step number(s) from which the notification status is obtained. Can be
10847
+ The validation step number(s) from which the 'critical' status is obtained. Can be
9841
10848
  provided as a list of integers or a single integer. If `None`, all steps are included.
9842
10849
  scalar
9843
10850
  If `True` and `i=` is a scalar, return the value as a scalar instead of a dictionary.
@@ -9845,7 +10852,7 @@ class Validate:
9845
10852
  Returns
9846
10853
  -------
9847
10854
  dict[int, bool] | bool
9848
- A dictionary of the notification status for each validation step or a scalar value.
10855
+ A dictionary of the 'critical' status for each validation step or a scalar value.
9849
10856
 
9850
10857
  Examples
9851
10858
  --------
@@ -9924,11 +10931,13 @@ class Validate:
9924
10931
  Get the rows that failed for each validation step.
9925
10932
 
9926
10933
  After the [`interrogate()`](`pointblank.Validate.interrogate`) method has been called, the
9927
- `get_data_extracts()` method can be used to extract the rows that failed in each row-based
9928
- validation step (e.g., [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`), etc.). The
9929
- method returns a dictionary of tables containing the rows that failed in every row-based
9930
- validation function. If `frame=True` and `i=` is a scalar, the value is conveniently
9931
- returned as a table (forgoing the dictionary structure).
10934
+ `get_data_extracts()` method can be used to extract the rows that failed in each
10935
+ column-value or row-based validation step (e.g.,
10936
+ [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`),
10937
+ [`rows_distinct()`](`pointblank.Validate.rows_distinct`), etc.). The method returns a
10938
+ dictionary of tables containing the rows that failed in every validation step. If
10939
+ `frame=True` and `i=` is a scalar, the value is conveniently returned as a table (forgoing
10940
+ the dictionary structure).
9932
10941
 
9933
10942
  Parameters
9934
10943
  ----------
@@ -9941,13 +10950,13 @@ class Validate:
9941
10950
  Returns
9942
10951
  -------
9943
10952
  dict[int, FrameT | None] | FrameT | None
9944
- A dictionary of tables containing the rows that failed in every row-based validation
9945
- step or a DataFrame.
10953
+ A dictionary of tables containing the rows that failed in every compatible validation
10954
+ step. Alternatively, it can be a DataFrame if `frame=True` and `i=` is a scalar.
9946
10955
 
9947
- Validation Methods that are Row-Based
9948
- -------------------------------------
9949
- The following validation methods are row-based and will have rows extracted when there are
9950
- failing test units.
10956
+ Compatible Validation Methods for Yielding Extracted Rows
10957
+ ---------------------------------------------------------
10958
+ The following validation methods operate on column values and will have rows extracted when
10959
+ there are failing test units.
9951
10960
 
9952
10961
  - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
9953
10962
  - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
@@ -9962,11 +10971,20 @@ class Validate:
9962
10971
  - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
9963
10972
  - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
9964
10973
  - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
10974
+ - [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
10975
+ - [`conjointly()`](`pointblank.Validate.conjointly`)
10976
+
10977
+ An extracted row for these validation methods means that a test unit failed for that row in
10978
+ the validation step.
10979
+
10980
+ These row-based validation methods will also have rows extracted should there be failing
10981
+ rows:
10982
+
9965
10983
  - [`rows_distinct()`](`pointblank.Validate.rows_distinct`)
10984
+ - [`rows_complete()`](`pointblank.Validate.rows_complete`)
9966
10985
 
9967
- An extracted row means that a test unit failed for that row in the validation step. The
9968
- extracted rows are a subset of the original table and are useful for further analysis or for
9969
- understanding the nature of the failing test units.
10986
+ The extracted rows are a subset of the original table and are useful for further analysis
10987
+ or for understanding the nature of the failing test units.
9970
10988
 
9971
10989
  Examples
9972
10990
  --------
@@ -10222,10 +11240,10 @@ class Validate:
10222
11240
  Get the data that passed or failed the validation steps.
10223
11241
 
10224
11242
  Validation of the data is one thing but, sometimes, you want to use the best part of the
10225
- input dataset for something else. The `get_sundered_data()` method works with a Validate
11243
+ input dataset for something else. The `get_sundered_data()` method works with a `Validate`
10226
11244
  object that has been interrogated (i.e., the
10227
11245
  [`interrogate()`](`pointblank.Validate.interrogate`) method was used). We can get either the
10228
- 'pass' data piece (rows with no failing test units across all row-based validation
11246
+ 'pass' data piece (rows with no failing test units across all column-value based validation
10229
11247
  functions), or, the 'fail' data piece (rows with at least one failing test unit across the
10230
11248
  same series of validations).
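
A short usage sketch, assuming the `type=` argument selects which piece to return:

```python
import pointblank as pb

tbl = pb.load_dataset(dataset="small_table", tbl_type="polars")

validation = (
    pb.Validate(data=tbl)
    .col_vals_gt(columns="d", value=100)
    .col_vals_not_null(columns="c")
    .interrogate()
)

# Rows with no failing test units across the column-value steps
passed_rows = validation.get_sundered_data(type="pass")

# Rows with at least one failing test unit across those same steps
failed_rows = validation.get_sundered_data(type="fail")
```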
10231
11249
 
@@ -10234,7 +11252,7 @@ class Validate:
10234
11252
  There are some caveats to sundering. The validation steps considered for this splitting will
10235
11253
  only involve steps where:
10236
11254
 
10237
- - of certain check types, where test units are cells checked row-by-row (e.g., the
11255
+ - of certain check types, where test units are cells checked down a column (e.g., the
10238
11256
  `col_vals_*()` methods)
10239
11257
  - `active=` is not set to `False`
10240
11258
  - `pre=` has not been given an expression for modifying the input table
@@ -10465,6 +11483,19 @@ class Validate:
10465
11483
  # Get information on the input data table
10466
11484
  tbl_info = _get_tbl_type(data=self.data)
10467
11485
 
11486
+ # If the table is a Polars one, determine if it's a LazyFrame
11487
+ if tbl_info == "polars":
11488
+ if _is_lazy_frame(self.data):
11489
+ tbl_info = "polars-lazy"
11490
+
11491
+ # Determine if the input table is a Narwhals DF
11492
+ if _is_narwhals_table(self.data):
11493
+ # Determine if the Narwhals table is a LazyFrame
11494
+ if _is_lazy_frame(self.data):
11495
+ tbl_info = "narwhals-lazy"
11496
+ else:
11497
+ tbl_info = "narwhals"
11498
+
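
The internal `_is_lazy_frame()` and `_is_narwhals_table()` helpers' bodies are not shown in this diff; a rough approximation of the checks implied by this logic (an assumption, not the actual implementation) might be:

```python
import narwhals as nw
import polars as pl

def looks_like_lazy_frame(data) -> bool:
    # Treat both Polars and Narwhals lazy frames as "lazy"
    return isinstance(data, (pl.LazyFrame, nw.LazyFrame))

def looks_like_narwhals_table(data) -> bool:
    # A table already wrapped by Narwhals, whether eager or lazy
    return isinstance(data, (nw.DataFrame, nw.LazyFrame))
```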
10468
11499
  # Get the thresholds object
10469
11500
  thresholds = self.thresholds
10470
11501
 
@@ -10517,7 +11548,9 @@ class Validate:
10517
11548
  # Create the label, table type, and thresholds HTML fragments
10518
11549
  label_html = _create_label_html(label=self.label, start_time="")
10519
11550
  table_type_html = _create_table_type_html(tbl_type=tbl_info, tbl_name=self.tbl_name)
10520
- thresholds_html = _create_thresholds_html(thresholds=thresholds, locale=locale)
11551
+ thresholds_html = _create_thresholds_html(
11552
+ thresholds=thresholds, locale=locale, df_lib=df_lib
11553
+ )
10521
11554
 
10522
11555
  # Compose the subtitle HTML fragment
10523
11556
  combined_subtitle = (
@@ -10830,6 +11863,7 @@ class Validate:
10830
11863
  interrogation_performed=interrogation_performed,
10831
11864
  active=active,
10832
11865
  locale=locale,
11866
+ df_lib=df_lib,
10833
11867
  )
10834
11868
 
10835
11869
  # ------------------------------------------------
@@ -10846,6 +11880,7 @@ class Validate:
10846
11880
  interrogation_performed=interrogation_performed,
10847
11881
  active=active,
10848
11882
  locale=locale,
11883
+ df_lib=df_lib,
10849
11884
  )
10850
11885
 
10851
11886
  validation_info_dict["fail"] = _transform_passed_failed(
@@ -10854,6 +11889,7 @@ class Validate:
10854
11889
  interrogation_performed=interrogation_performed,
10855
11890
  active=active,
10856
11891
  locale=locale,
11892
+ df_lib=df_lib,
10857
11893
  )
10858
11894
 
10859
11895
  # ------------------------------------------------
@@ -11033,7 +12069,9 @@ class Validate:
11033
12069
  # Create the label, table type, and thresholds HTML fragments
11034
12070
  label_html = _create_label_html(label=self.label, start_time=self.time_start)
11035
12071
  table_type_html = _create_table_type_html(tbl_type=tbl_info, tbl_name=self.tbl_name)
11036
- thresholds_html = _create_thresholds_html(thresholds=thresholds, locale=locale)
12072
+ thresholds_html = _create_thresholds_html(
12073
+ thresholds=thresholds, locale=locale, df_lib=df_lib
12074
+ )
11037
12075
 
11038
12076
  # Compose the subtitle HTML fragment
11039
12077
  combined_subtitle = (
@@ -11291,24 +12329,25 @@ class Validate:
11291
12329
  Types of Step Reports
11292
12330
  ---------------------
11293
12331
  The `get_step_report()` method produces a report based on the *type* of validation step.
11294
- The following row-based validation methods will produce a report that shows the rows of the
11295
- data that failed because of failing test units within one or more columns failed:
12332
+ The following column-value or row-based validation methods will produce a
12333
+ report that shows the rows of the data that failed:
11296
12334
 
11297
12335
  - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
12336
+ - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
11298
12337
  - [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`)
12338
+ - [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
11299
12339
  - [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`)
11300
12340
  - [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`)
11301
- - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
11302
- - [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
11303
12341
  - [`col_vals_between()`](`pointblank.Validate.col_vals_between`)
11304
12342
  - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
11305
12343
  - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
11306
12344
  - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
11307
- - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
11308
12345
  - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
11309
12346
  - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
11310
- - [`rows_complete()`](`pointblank.Validate.rows_complete`)
12347
+ - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
12348
+ - [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
11311
12349
  - [`conjointly()`](`pointblank.Validate.conjointly`)
12350
+ - [`rows_complete()`](`pointblank.Validate.rows_complete`)
11312
12351
 
11313
12352
  The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
11314
12353
  report that shows duplicate rows (or duplicate values in one or a set of columns as defined
@@ -12835,20 +13874,78 @@ def _transform_eval(
12835
13874
  return symbol_list
12836
13875
 
12837
13876
 
13877
+ def _format_numbers_with_gt(
13878
+ values: list[int], n_sigfig: int = 3, compact: bool = True, locale: str = "en"
13879
+ ) -> list[str]:
13880
+ """Format numbers using Great Tables GT object to avoid pandas dependency."""
13881
+ import polars as pl
13882
+
13883
+ # Create a single-column DataFrame with all values
13884
+ df = pl.DataFrame({"values": values})
13885
+
13886
+ # Create GT object and format the column
13887
+ gt_obj = GT(df).fmt_number(columns="values", n_sigfig=n_sigfig, compact=compact, locale=locale)
13888
+
13889
+ # Extract the formatted values using _get_column_of_values
13890
+ formatted_values = _get_column_of_values(gt_obj, column_name="values", context="html")
13891
+
13892
+ return formatted_values
13893
+
13894
+
13895
+ def _format_single_number_with_gt(
13896
+ value: int, n_sigfig: int = 3, compact: bool = True, locale: str = "en", df_lib=None
13897
+ ) -> str:
13898
+ """Format a single number using Great Tables GT object to avoid pandas dependency."""
13899
+ if df_lib is None:
13900
+ # Use library detection to select appropriate DataFrame library
13901
+ if _is_lib_present("polars"):
13902
+ import polars as pl
13903
+
13904
+ df_lib = pl
13905
+ elif _is_lib_present("pandas"):
13906
+ import pandas as pd
13907
+
13908
+ df_lib = pd
13909
+ else:
13910
+ raise ImportError("Neither Polars nor Pandas is available for formatting")
13911
+
13912
+ # Create a single-row, single-column DataFrame using the specified library
13913
+ df = df_lib.DataFrame({"value": [value]})
13914
+
13915
+ # Create GT object and format the column
13916
+ gt_obj = GT(df).fmt_number(columns="value", n_sigfig=n_sigfig, compact=compact, locale=locale)
13917
+
13918
+ # Extract the formatted value using _get_column_of_values
13919
+ formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
13920
+
13921
+ return formatted_values[0] # Return the single formatted value
13922
+
13923
+
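
A quick illustration of the helper above (assuming Polars is installed; Pandas works the same way via `df_lib=pd`):

```python
import polars as pl

# e.g. 1234567 is rendered as a compact, locale-aware label such as "1.23M"
label = _format_single_number_with_gt(1_234_567, n_sigfig=3, compact=True, df_lib=pl)
```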
12838
13924
  def _transform_test_units(
12839
- test_units: list[int], interrogation_performed: bool, active: list[bool], locale: str
13925
+ test_units: list[int],
13926
+ interrogation_performed: bool,
13927
+ active: list[bool],
13928
+ locale: str,
13929
+ df_lib=None,
12840
13930
  ) -> list[str]:
12841
13931
  # If no interrogation was performed, return a list of empty strings
12842
13932
  if not interrogation_performed:
12843
13933
  return ["" for _ in range(len(test_units))]
12844
13934
 
13935
+ # Define the helper function that'll format numbers safely with Great Tables
13936
+ def _format_number_safe(value: int) -> str:
13937
+ if df_lib is not None:
13938
+ # Use GT-based formatting to avoid Pandas dependency completely
13939
+ return _format_single_number_with_gt(
13940
+ value, n_sigfig=3, compact=True, locale=locale, df_lib=df_lib
13941
+ )
13942
+ else:
13943
+ # Fallback to the original behavior
13944
+ return str(vals.fmt_number(value, n_sigfig=3, compact=True, locale=locale)[0])
13945
+
12845
13946
  return [
12846
13947
  (
12847
- (
12848
- str(test_units[i])
12849
- if test_units[i] < 10000
12850
- else str(vals.fmt_number(test_units[i], n_sigfig=3, compact=True, locale=locale)[0])
12851
- )
13948
+ (str(test_units[i]) if test_units[i] < 10000 else _format_number_safe(test_units[i]))
12852
13949
  if active[i]
12853
13950
  else "&mdash;"
12854
13951
  )
@@ -12856,8 +13953,43 @@ def _transform_test_units(
12856
13953
  ]
12857
13954
 
12858
13955
 
12859
- def _fmt_lg(value: int, locale: str) -> str:
12860
- return vals.fmt_number(value, n_sigfig=3, compact=True, locale=locale)[0]
13956
+ def _fmt_lg(value: int, locale: str, df_lib=None) -> str:
13957
+ if df_lib is not None:
13958
+ # Use GT-based formatting if a DataFrame library is provided
13959
+ return _format_single_number_with_gt(
13960
+ value, n_sigfig=3, compact=True, locale=locale, df_lib=df_lib
13961
+ )
13962
+ else:
13963
+ # Fallback to the original behavior
13964
+ return vals.fmt_number(value, n_sigfig=3, compact=True, locale=locale)[0]
13965
+
13966
+
13967
+ def _format_single_float_with_gt(
13968
+ value: float, decimals: int = 2, locale: str = "en", df_lib=None
13969
+ ) -> str:
13970
+ if df_lib is None:
13971
+ # Use library detection to select appropriate DataFrame library
13972
+ if _is_lib_present("polars"):
13973
+ import polars as pl
13974
+
13975
+ df_lib = pl
13976
+ elif _is_lib_present("pandas"):
13977
+ import pandas as pd
13978
+
13979
+ df_lib = pd
13980
+ else:
13981
+ raise ImportError("Neither Polars nor Pandas is available for formatting")
13982
+
13983
+ # Create a single-row, single-column DataFrame using the specified library
13984
+ df = df_lib.DataFrame({"value": [value]})
13985
+
13986
+ # Create GT object and format the column
13987
+ gt_obj = GT(df).fmt_number(columns="value", decimals=decimals, locale=locale)
13988
+
13989
+ # Extract the formatted value using _get_column_of_values
13990
+ formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
13991
+
13992
+ return formatted_values[0] # Return the single formatted value
12861
13993
 
12862
13994
 
12863
13995
  def _transform_passed_failed(
@@ -12866,14 +13998,24 @@ def _transform_passed_failed(
12866
13998
  interrogation_performed: bool,
12867
13999
  active: list[bool],
12868
14000
  locale: str,
14001
+ df_lib=None,
12869
14002
  ) -> list[str]:
12870
14003
  if not interrogation_performed:
12871
14004
  return ["" for _ in range(len(n_passed_failed))]
12872
14005
 
14006
+ # Helper function to format numbers safely
14007
+ def _format_float_safe(value: float) -> str:
14008
+ if df_lib is not None:
14009
+ # Use GT-based formatting to avoid Pandas dependency completely
14010
+ return _format_single_float_with_gt(value, decimals=2, locale=locale, df_lib=df_lib)
14011
+ else:
14012
+ # Fallback to the original behavior
14013
+ return vals.fmt_number(value, decimals=2, locale=locale)[0]
14014
+
12873
14015
  passed_failed = [
12874
14016
  (
12875
- f"{n_passed_failed[i] if n_passed_failed[i] < 10000 else _fmt_lg(n_passed_failed[i], locale=locale)}"
12876
- f"<br />{vals.fmt_number(f_passed_failed[i], decimals=2, locale=locale)[0]}"
14017
+ f"{n_passed_failed[i] if n_passed_failed[i] < 10000 else _fmt_lg(n_passed_failed[i], locale=locale, df_lib=df_lib)}"
14018
+ f"<br />{_format_float_safe(f_passed_failed[i])}"
12877
14019
  if active[i]
12878
14020
  else "&mdash;"
12879
14021
  )
@@ -13084,41 +14226,122 @@ def _create_label_html(label: str | None, start_time: str) -> str:
13084
14226
  )
13085
14227
 
13086
14228
 
13087
- def _create_thresholds_html(thresholds: Thresholds, locale: str) -> str:
14229
+ def _format_single_integer_with_gt(value: int, locale: str = "en", df_lib=None) -> str:
14230
+ """Format a single integer using Great Tables GT object to avoid pandas dependency."""
14231
+ if df_lib is None:
14232
+ # Use library detection to select appropriate DataFrame library
14233
+ if _is_lib_present("polars"):
14234
+ import polars as pl
14235
+
14236
+ df_lib = pl
14237
+ elif _is_lib_present("pandas"):
14238
+ import pandas as pd
14239
+
14240
+ df_lib = pd
14241
+ else:
14242
+ raise ImportError("Neither Polars nor Pandas is available for formatting")
14243
+
14244
+ # Create a single-row, single-column DataFrame using the specified library
14245
+ df = df_lib.DataFrame({"value": [value]})
14246
+
14247
+ # Create GT object and format the column
14248
+ gt_obj = GT(df).fmt_integer(columns="value", locale=locale)
14249
+
14250
+ # Extract the formatted value using _get_column_of_values
14251
+ formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
14252
+
14253
+ return formatted_values[0] # Return the single formatted value
14254
+
14255
+
14256
+ def _format_single_float_with_gt_custom(
14257
+ value: float,
14258
+ decimals: int = 2,
14259
+ drop_trailing_zeros: bool = False,
14260
+ locale: str = "en",
14261
+ df_lib=None,
14262
+ ) -> str:
14263
+ """Format a single float with custom options using Great Tables GT object to avoid pandas dependency."""
14264
+ if df_lib is None:
14265
+ # Use library detection to select appropriate DataFrame library
14266
+ if _is_lib_present("polars"):
14267
+ import polars as pl
14268
+
14269
+ df_lib = pl
14270
+ elif _is_lib_present("pandas"):
14271
+ import pandas as pd
14272
+
14273
+ df_lib = pd
14274
+ else:
14275
+ raise ImportError("Neither Polars nor Pandas is available for formatting")
14276
+
14277
+ # Create a single-row, single-column DataFrame using the specified library
14278
+ df = df_lib.DataFrame({"value": [value]})
14279
+
14280
+ # Create GT object and format the column
14281
+ gt_obj = GT(df).fmt_number(
14282
+ columns="value", decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
14283
+ )
14284
+
14285
+ # Extract the formatted value using _get_column_of_values
14286
+ formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
14287
+
14288
+ return formatted_values[0] # Return the single formatted value
14289
+
14290
+
14291
+ def _create_thresholds_html(thresholds: Thresholds, locale: str, df_lib=None) -> str:
13088
14292
  if thresholds == Thresholds():
13089
14293
  return ""
13090
14294
 
14295
+ # Helper functions to format numbers safely
14296
+ def _format_number_safe(value: float, decimals: int, drop_trailing_zeros: bool = False) -> str:
14297
+ if df_lib is not None and value is not None:
14298
+ # Use GT-based formatting to avoid Pandas dependency completely
14299
+ return _format_single_float_with_gt_custom(
14300
+ value,
14301
+ decimals=decimals,
14302
+ drop_trailing_zeros=drop_trailing_zeros,
14303
+ locale=locale,
14304
+ df_lib=df_lib,
14305
+ )
14306
+ else:
14307
+ # Fallback to the original behavior
14308
+ return fmt_number(
14309
+ value, decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
14310
+ )[0]
14311
+
14312
+ def _format_integer_safe(value: int) -> str:
14313
+ if df_lib is not None and value is not None:
14314
+ # Use GT-based formatting to avoid Pandas dependency completely
14315
+ return _format_single_integer_with_gt(value, locale=locale, df_lib=df_lib)
14316
+ else:
14317
+ # Fallback to the original behavior
14318
+ return fmt_integer(value, locale=locale)[0]
14319
+
13091
14320
  warning = (
13092
- fmt_number(
13093
- thresholds.warning_fraction, decimals=3, drop_trailing_zeros=True, locale=locale
13094
- )[0]
14321
+ _format_number_safe(thresholds.warning_fraction, decimals=3, drop_trailing_zeros=True)
13095
14322
  if thresholds.warning_fraction is not None
13096
14323
  else (
13097
- fmt_integer(thresholds.warning_count, locale=locale)[0]
14324
+ _format_integer_safe(thresholds.warning_count)
13098
14325
  if thresholds.warning_count is not None
13099
14326
  else "&mdash;"
13100
14327
  )
13101
14328
  )
13102
14329
 
13103
14330
  error = (
13104
- fmt_number(thresholds.error_fraction, decimals=3, drop_trailing_zeros=True, locale=locale)[
13105
- 0
13106
- ]
14331
+ _format_number_safe(thresholds.error_fraction, decimals=3, drop_trailing_zeros=True)
13107
14332
  if thresholds.error_fraction is not None
13108
14333
  else (
13109
- fmt_integer(thresholds.error_count, locale=locale)[0]
14334
+ _format_integer_safe(thresholds.error_count)
13110
14335
  if thresholds.error_count is not None
13111
14336
  else "&mdash;"
13112
14337
  )
13113
14338
  )
13114
14339
 
13115
14340
  critical = (
13116
- fmt_number(
13117
- thresholds.critical_fraction, decimals=3, drop_trailing_zeros=True, locale=locale
13118
- )[0]
14341
+ _format_number_safe(thresholds.critical_fraction, decimals=3, drop_trailing_zeros=True)
13119
14342
  if thresholds.critical_fraction is not None
13120
14343
  else (
13121
- fmt_integer(thresholds.critical_count, locale=locale)[0]
14344
+ _format_integer_safe(thresholds.critical_count)
13122
14345
  if thresholds.critical_count is not None
13123
14346
  else "&mdash;"
13124
14347
  )