pointblank 0.9.5__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +4 -0
- pointblank/_constants.py +6 -0
- pointblank/_datascan_utils.py +65 -0
- pointblank/_utils.py +128 -0
- pointblank/_utils_html.py +40 -0
- pointblank/actions.py +3 -3
- pointblank/assistant.py +1 -3
- pointblank/column.py +4 -4
- pointblank/compare.py +27 -0
- pointblank/data/api-docs.txt +769 -138
- pointblank/datascan.py +318 -959
- pointblank/scan_profile.py +321 -0
- pointblank/scan_profile_stats.py +180 -0
- pointblank/schema.py +14 -3
- pointblank/thresholds.py +2 -2
- pointblank/validate.py +1594 -207
- {pointblank-0.9.5.dist-info → pointblank-0.10.0.dist-info}/METADATA +6 -3
- pointblank-0.10.0.dist-info/RECORD +37 -0
- {pointblank-0.9.5.dist-info → pointblank-0.10.0.dist-info}/WHEEL +1 -1
- pointblank-0.9.5.dist-info/RECORD +0 -33
- {pointblank-0.9.5.dist-info → pointblank-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.9.5.dist-info → pointblank-0.10.0.dist-info}/top_level.txt +0 -0
pointblank/validate.py
CHANGED
|
@@ -17,6 +17,7 @@ from zipfile import ZipFile
|
|
|
17
17
|
import commonmark
|
|
18
18
|
import narwhals as nw
|
|
19
19
|
from great_tables import GT, from_column, google_font, html, loc, md, style, vals
|
|
20
|
+
from great_tables.gt import _get_column_of_values
|
|
20
21
|
from great_tables.vals import fmt_integer, fmt_number
|
|
21
22
|
from importlib_resources import files
|
|
22
23
|
from narwhals.typing import FrameT
|
|
@@ -64,11 +65,15 @@ from pointblank._typing import SegmentSpec
|
|
|
64
65
|
from pointblank._utils import (
|
|
65
66
|
_check_any_df_lib,
|
|
66
67
|
_check_invalid_fields,
|
|
68
|
+
_count_null_values_in_column,
|
|
69
|
+
_count_true_values_in_column,
|
|
67
70
|
_derive_bounds,
|
|
68
71
|
_format_to_integer_value,
|
|
69
72
|
_get_fn_name,
|
|
70
73
|
_get_tbl_type,
|
|
74
|
+
_is_lazy_frame,
|
|
71
75
|
_is_lib_present,
|
|
76
|
+
_is_narwhals_table,
|
|
72
77
|
_is_value_a_df,
|
|
73
78
|
_select_df_lib,
|
|
74
79
|
)
|
|
@@ -99,11 +104,13 @@ __all__ = [
|
|
|
99
104
|
"Validate",
|
|
100
105
|
"load_dataset",
|
|
101
106
|
"config",
|
|
107
|
+
"connect_to_table",
|
|
102
108
|
"preview",
|
|
103
109
|
"missing_vals_tbl",
|
|
110
|
+
"get_action_metadata",
|
|
104
111
|
"get_column_count",
|
|
112
|
+
"get_data_path",
|
|
105
113
|
"get_row_count",
|
|
106
|
-
"get_action_metadata",
|
|
107
114
|
"get_validation_summary",
|
|
108
115
|
]
|
|
109
116
|
|
|
@@ -495,7 +502,9 @@ def load_dataset(
|
|
|
495
502
|
raise ValueError(
|
|
496
503
|
f"The dataset name `{dataset}` is not valid. Choose one of the following:\n"
|
|
497
504
|
"- `small_table`\n"
|
|
498
|
-
"- `game_revenue
|
|
505
|
+
"- `game_revenue`\n"
|
|
506
|
+
"- `nycflights`\n"
|
|
507
|
+
"- `global_sales`"
|
|
499
508
|
)
|
|
500
509
|
|
|
501
510
|
# Raise an error if the `tbl_type=` value is not of the supported types
|
|
@@ -560,6 +569,405 @@ def load_dataset(
|
|
|
560
569
|
return dataset
|
|
561
570
|
|
|
562
571
|
|
|
572
|
+
def get_data_path(
|
|
573
|
+
dataset: Literal["small_table", "game_revenue", "nycflights", "global_sales"] = "small_table",
|
|
574
|
+
file_type: Literal["csv", "parquet", "duckdb"] = "csv",
|
|
575
|
+
) -> str:
|
|
576
|
+
"""
|
|
577
|
+
Get the file path to a dataset included with the Pointblank package.
|
|
578
|
+
|
|
579
|
+
This function provides direct access to the file paths of datasets included with Pointblank.
|
|
580
|
+
These paths can be used in examples and documentation to demonstrate file-based data loading
|
|
581
|
+
without requiring the actual data files. The returned paths can be used with
|
|
582
|
+
`Validate(data=path)` to demonstrate CSV and Parquet file loading capabilities.
|
|
583
|
+
|
|
584
|
+
Parameters
|
|
585
|
+
----------
|
|
586
|
+
dataset
|
|
587
|
+
The name of the dataset to get the path for. Current options are `"small_table"`,
|
|
588
|
+
`"game_revenue"`, `"nycflights"`, and `"global_sales"`.
|
|
589
|
+
file_type
|
|
590
|
+
The file format to get the path for. Options are `"csv"`, `"parquet"`, or `"duckdb"`.
|
|
591
|
+
|
|
592
|
+
Returns
|
|
593
|
+
-------
|
|
594
|
+
str
|
|
595
|
+
The file path to the requested dataset file.
|
|
596
|
+
|
|
597
|
+
Included Datasets
|
|
598
|
+
-----------------
|
|
599
|
+
The available datasets are the same as those in [`load_dataset()`](`pointblank.load_dataset`):
|
|
600
|
+
|
|
601
|
+
- `"small_table"`: A small dataset with 13 rows and 8 columns. Ideal for testing and examples.
|
|
602
|
+
- `"game_revenue"`: A dataset with 2000 rows and 11 columns. Revenue data for a game company.
|
|
603
|
+
- `"nycflights"`: A dataset with 336,776 rows and 18 columns. Flight data from NYC airports.
|
|
604
|
+
- `"global_sales"`: A dataset with 50,000 rows and 20 columns. Global sales data across regions.
|
|
605
|
+
|
|
606
|
+
File Types
|
|
607
|
+
----------
|
|
608
|
+
Each dataset is available in multiple formats:
|
|
609
|
+
|
|
610
|
+
- `"csv"`: Comma-separated values file (`.csv`)
|
|
611
|
+
- `"parquet"`: Parquet file (`.parquet`)
|
|
612
|
+
- `"duckdb"`: DuckDB database file (`.ddb`)
|
|
613
|
+
|
|
614
|
+
Examples
|
|
615
|
+
--------
|
|
616
|
+
Get the path to a CSV file and use it with `Validate`:
|
|
617
|
+
|
|
618
|
+
```{python}
|
|
619
|
+
import pointblank as pb
|
|
620
|
+
|
|
621
|
+
# Get path to the small_table CSV file
|
|
622
|
+
csv_path = pb.get_data_path("small_table", "csv")
|
|
623
|
+
print(csv_path)
|
|
624
|
+
|
|
625
|
+
# Use the path directly with Validate
|
|
626
|
+
validation = (
|
|
627
|
+
pb.Validate(data=csv_path)
|
|
628
|
+
.col_exists(["a", "b", "c"])
|
|
629
|
+
.col_vals_gt(columns="d", value=0)
|
|
630
|
+
.interrogate()
|
|
631
|
+
)
|
|
632
|
+
|
|
633
|
+
validation
|
|
634
|
+
```
|
|
635
|
+
|
|
636
|
+
Get a Parquet file path for validation examples:
|
|
637
|
+
|
|
638
|
+
```{python}
|
|
639
|
+
# Get path to the game_revenue Parquet file
|
|
640
|
+
parquet_path = pb.get_data_path(dataset="game_revenue", file_type="parquet")
|
|
641
|
+
|
|
642
|
+
# Validate the Parquet file directly
|
|
643
|
+
validation = (
|
|
644
|
+
pb.Validate(data=parquet_path, label="Game Revenue Data Validation")
|
|
645
|
+
.col_vals_not_null(columns=["player_id", "session_id"])
|
|
646
|
+
.col_vals_gt(columns="item_revenue", value=0)
|
|
647
|
+
.interrogate()
|
|
648
|
+
)
|
|
649
|
+
|
|
650
|
+
validation
|
|
651
|
+
```
|
|
652
|
+
|
|
653
|
+
This is particularly useful for documentation examples where you want to demonstrate
|
|
654
|
+
file-based workflows without requiring users to have specific data files:
|
|
655
|
+
|
|
656
|
+
```{python}
|
|
657
|
+
# Example showing CSV file validation
|
|
658
|
+
sales_csv = pb.get_data_path(dataset="global_sales", file_type="csv")
|
|
659
|
+
|
|
660
|
+
validation = (
|
|
661
|
+
pb.Validate(data=sales_csv, label="Sales Data Validation")
|
|
662
|
+
.col_exists(["customer_id", "product_id", "amount"])
|
|
663
|
+
.col_vals_regex(columns="customer_id", pattern=r"CUST_[0-9]{6}")
|
|
664
|
+
.interrogate()
|
|
665
|
+
)
|
|
666
|
+
```
|
|
667
|
+
|
|
668
|
+
See Also
|
|
669
|
+
--------
|
|
670
|
+
[`load_dataset()`](`pointblank.load_dataset`) for loading datasets directly as table objects.
|
|
671
|
+
"""
|
|
672
|
+
|
|
673
|
+
# Validate inputs
|
|
674
|
+
if dataset not in ["small_table", "game_revenue", "nycflights", "global_sales"]:
|
|
675
|
+
raise ValueError(
|
|
676
|
+
f"The dataset name `{dataset}` is not valid. Choose one of the following:\n"
|
|
677
|
+
"- `small_table`\n"
|
|
678
|
+
"- `game_revenue`\n"
|
|
679
|
+
"- `nycflights`\n"
|
|
680
|
+
"- `global_sales`"
|
|
681
|
+
)
|
|
682
|
+
|
|
683
|
+
if file_type not in ["csv", "parquet", "duckdb"]:
|
|
684
|
+
raise ValueError(
|
|
685
|
+
f"The file type `{file_type}` is not valid. Choose one of the following:\n"
|
|
686
|
+
"- `csv`\n"
|
|
687
|
+
"- `parquet`\n"
|
|
688
|
+
"- `duckdb`"
|
|
689
|
+
)
|
|
690
|
+
|
|
691
|
+
if file_type == "csv":
|
|
692
|
+
# Return path to CSV file inside the zip
|
|
693
|
+
data_path = files("pointblank.data") / f"{dataset}.zip"
|
|
694
|
+
|
|
695
|
+
# For CSV files, we need to extract from zip to a temporary location
|
|
696
|
+
# since most libraries expect actual file paths, not zip contents
|
|
697
|
+
with tempfile.NamedTemporaryFile(mode="wb", suffix=".csv", delete=False) as tmp_file:
|
|
698
|
+
with ZipFile(data_path) as zip_file:
|
|
699
|
+
csv_content = zip_file.read(f"{dataset}.csv")
|
|
700
|
+
tmp_file.write(csv_content)
|
|
701
|
+
return tmp_file.name
|
|
702
|
+
|
|
703
|
+
elif file_type == "parquet":
|
|
704
|
+
# Create a temporary parquet file from the CSV data
|
|
705
|
+
data_path = files("pointblank.data") / f"{dataset}.zip"
|
|
706
|
+
|
|
707
|
+
# We'll need to convert CSV to Parquet temporarily
|
|
708
|
+
with tempfile.NamedTemporaryFile(mode="wb", suffix=".parquet", delete=False) as tmp_file:
|
|
709
|
+
# Load CSV data and save as Parquet
|
|
710
|
+
if _is_lib_present(lib_name="polars"):
|
|
711
|
+
import polars as pl
|
|
712
|
+
|
|
713
|
+
df = pl.read_csv(ZipFile(data_path).read(f"{dataset}.csv"), try_parse_dates=True)
|
|
714
|
+
df.write_parquet(tmp_file.name)
|
|
715
|
+
elif _is_lib_present(lib_name="pandas"):
|
|
716
|
+
import pandas as pd
|
|
717
|
+
|
|
718
|
+
df = pd.read_csv(data_path)
|
|
719
|
+
df.to_parquet(tmp_file.name, index=False)
|
|
720
|
+
else:
|
|
721
|
+
raise ImportError(
|
|
722
|
+
"Either Polars or Pandas is required to create temporary Parquet files."
|
|
723
|
+
)
|
|
724
|
+
return tmp_file.name
|
|
725
|
+
|
|
726
|
+
elif file_type == "duckdb":
|
|
727
|
+
# Return path to DuckDB file
|
|
728
|
+
data_path = files("pointblank.data") / f"{dataset}-duckdb.zip"
|
|
729
|
+
|
|
730
|
+
# Extract DuckDB file to temporary location
|
|
731
|
+
with tempfile.NamedTemporaryFile(mode="wb", suffix=".ddb", delete=False) as tmp_file:
|
|
732
|
+
with ZipFile(data_path) as zip_file:
|
|
733
|
+
ddb_content = zip_file.read(f"{dataset}.ddb")
|
|
734
|
+
tmp_file.write(ddb_content)
|
|
735
|
+
return tmp_file.name
|
|
736
|
+
|
|
737
|
+
|
|
738
|
+
# =============================================================================
|
|
739
|
+
# Utility functions for processing input data (shared by preview() and Validate class)
|
|
740
|
+
# =============================================================================
|
|
741
|
+
|
|
742
|
+
|
|
743
|
+
def _process_connection_string(data: FrameT | Any) -> FrameT | Any:
|
|
744
|
+
"""
|
|
745
|
+
Process data parameter to handle database connection strings.
|
|
746
|
+
|
|
747
|
+
Uses the `connect_to_table()` utility function to handle URI-formatted connection strings with
|
|
748
|
+
table specifications. Returns the original data if it's not a connection string.
|
|
749
|
+
|
|
750
|
+
For more details on supported connection string formats, see the documentation
|
|
751
|
+
for `connect_to_table()`.
|
|
752
|
+
"""
|
|
753
|
+
# Check if data is a string that looks like a connection string
|
|
754
|
+
if not isinstance(data, str):
|
|
755
|
+
return data
|
|
756
|
+
|
|
757
|
+
# Basic connection string patterns
|
|
758
|
+
connection_patterns = [
|
|
759
|
+
"://", # General URL-like pattern
|
|
760
|
+
]
|
|
761
|
+
|
|
762
|
+
# Check if it looks like a connection string
|
|
763
|
+
if not any(pattern in data for pattern in connection_patterns):
|
|
764
|
+
return data
|
|
765
|
+
|
|
766
|
+
# Use the utility function to connect to the table
|
|
767
|
+
return connect_to_table(data)
|
|
768
|
+
|
|
769
|
+
|
|
770
|
+
def _process_csv_input(data: FrameT | Any) -> FrameT | Any:
|
|
771
|
+
"""
|
|
772
|
+
Process data parameter to handle CSV file inputs.
|
|
773
|
+
|
|
774
|
+
If data is a string or Path with .csv extension, reads the CSV file
|
|
775
|
+
using available libraries (Polars preferred, then Pandas).
|
|
776
|
+
|
|
777
|
+
Returns the original data if it's not a CSV file path.
|
|
778
|
+
"""
|
|
779
|
+
from pathlib import Path
|
|
780
|
+
|
|
781
|
+
# Check if data is a string or Path-like object with .csv extension
|
|
782
|
+
csv_path = None
|
|
783
|
+
|
|
784
|
+
if isinstance(data, (str, Path)):
|
|
785
|
+
path_obj = Path(data)
|
|
786
|
+
if path_obj.suffix.lower() == ".csv":
|
|
787
|
+
csv_path = path_obj
|
|
788
|
+
|
|
789
|
+
# If it's not a CSV file path, return the original data
|
|
790
|
+
if csv_path is None:
|
|
791
|
+
return data
|
|
792
|
+
|
|
793
|
+
# Check if the CSV file exists
|
|
794
|
+
if not csv_path.exists():
|
|
795
|
+
raise FileNotFoundError(f"CSV file not found: {csv_path}")
|
|
796
|
+
|
|
797
|
+
# Determine which library to use for reading CSV
|
|
798
|
+
# Prefer Polars, fallback to Pandas
|
|
799
|
+
if _is_lib_present(lib_name="polars"):
|
|
800
|
+
try:
|
|
801
|
+
import polars as pl
|
|
802
|
+
|
|
803
|
+
return pl.read_csv(csv_path, try_parse_dates=True)
|
|
804
|
+
except Exception as e:
|
|
805
|
+
# If Polars fails, try Pandas if available
|
|
806
|
+
if _is_lib_present(lib_name="pandas"):
|
|
807
|
+
import pandas as pd
|
|
808
|
+
|
|
809
|
+
return pd.read_csv(csv_path)
|
|
810
|
+
else:
|
|
811
|
+
raise RuntimeError(
|
|
812
|
+
f"Failed to read CSV file with Polars: {e}. "
|
|
813
|
+
"Pandas is not available as fallback."
|
|
814
|
+
) from e
|
|
815
|
+
elif _is_lib_present(lib_name="pandas"):
|
|
816
|
+
try:
|
|
817
|
+
import pandas as pd
|
|
818
|
+
|
|
819
|
+
return pd.read_csv(csv_path)
|
|
820
|
+
except Exception as e:
|
|
821
|
+
raise RuntimeError(f"Failed to read CSV file with Pandas: {e}") from e
|
|
822
|
+
else:
|
|
823
|
+
raise ImportError(
|
|
824
|
+
"Neither Polars nor Pandas is available for reading CSV files. "
|
|
825
|
+
"Please install either 'polars' or 'pandas' to use CSV file inputs."
|
|
826
|
+
)
|
|
827
|
+
|
|
828
|
+
|
|
829
|
+
def _process_parquet_input(data: FrameT | Any) -> FrameT | Any:
|
|
830
|
+
"""
|
|
831
|
+
Process data parameter to handle Parquet file inputs.
|
|
832
|
+
|
|
833
|
+
Supports:
|
|
834
|
+
- single .parquet file (string or Path)
|
|
835
|
+
- glob patterns for multiple .parquet files (e.g., "data/*.parquet")
|
|
836
|
+
- directory containing .parquet files
|
|
837
|
+
- partitioned Parquet datasets with automatic partition column inference
|
|
838
|
+
- list/sequence of .parquet file paths
|
|
839
|
+
|
|
840
|
+
Returns the original data if it's not a Parquet file input.
|
|
841
|
+
"""
|
|
842
|
+
import glob
|
|
843
|
+
from pathlib import Path
|
|
844
|
+
|
|
845
|
+
parquet_paths = []
|
|
846
|
+
|
|
847
|
+
# Handle different input types
|
|
848
|
+
if isinstance(data, (str, Path)):
|
|
849
|
+
data_str = str(data)
|
|
850
|
+
path_obj = Path(data)
|
|
851
|
+
|
|
852
|
+
# Check if it's a glob pattern containing .parquet first; look for glob
|
|
853
|
+
# characters: `*`, `?`, `[`, `]`
|
|
854
|
+
if ".parquet" in data_str.lower() and any(
|
|
855
|
+
char in data_str for char in ["*", "?", "[", "]"]
|
|
856
|
+
):
|
|
857
|
+
parquet_files = glob.glob(data_str)
|
|
858
|
+
if parquet_files:
|
|
859
|
+
parquet_paths = sorted([Path(f) for f in parquet_files])
|
|
860
|
+
else:
|
|
861
|
+
raise FileNotFoundError(f"No files found matching pattern: {data}")
|
|
862
|
+
|
|
863
|
+
# Check if it's a single .parquet file
|
|
864
|
+
elif path_obj.suffix.lower() == ".parquet":
|
|
865
|
+
if path_obj.exists():
|
|
866
|
+
parquet_paths = [path_obj]
|
|
867
|
+
else:
|
|
868
|
+
raise FileNotFoundError(f"Parquet file not found: {path_obj}")
|
|
869
|
+
|
|
870
|
+
# Check if it's a directory
|
|
871
|
+
elif path_obj.is_dir():
|
|
872
|
+
# First, try to read as a partitioned parquet dataset; This handles datasets where
|
|
873
|
+
# Parquet files are in subdirectories with partition columns encoded in paths
|
|
874
|
+
try:
|
|
875
|
+
# Both Polars and Pandas can handle partitioned datasets natively
|
|
876
|
+
if _is_lib_present(lib_name="polars"):
|
|
877
|
+
import polars as pl
|
|
878
|
+
|
|
879
|
+
# Try reading as partitioned dataset first
|
|
880
|
+
df = pl.read_parquet(str(path_obj))
|
|
881
|
+
return df
|
|
882
|
+
elif _is_lib_present(lib_name="pandas"):
|
|
883
|
+
import pandas as pd
|
|
884
|
+
|
|
885
|
+
# Try reading as partitioned dataset first
|
|
886
|
+
df = pd.read_parquet(str(path_obj))
|
|
887
|
+
return df
|
|
888
|
+
except Exception:
|
|
889
|
+
# If partitioned read fails, fall back to simple directory scan
|
|
890
|
+
pass
|
|
891
|
+
|
|
892
|
+
# Fallback: Look for .parquet files directly in the directory
|
|
893
|
+
parquet_files = list(path_obj.glob("*.parquet"))
|
|
894
|
+
if parquet_files:
|
|
895
|
+
parquet_paths = sorted(parquet_files)
|
|
896
|
+
else:
|
|
897
|
+
raise FileNotFoundError(
|
|
898
|
+
f"No .parquet files found in directory: {path_obj}. "
|
|
899
|
+
f"This could be a non-partitioned directory without .parquet files, "
|
|
900
|
+
f"or a partitioned dataset that couldn't be read."
|
|
901
|
+
)
|
|
902
|
+
|
|
903
|
+
# If it's not a parquet file, directory, or glob pattern, return original data
|
|
904
|
+
else:
|
|
905
|
+
return data
|
|
906
|
+
|
|
907
|
+
# Handle list/sequence of paths
|
|
908
|
+
elif isinstance(data, (list, tuple)):
|
|
909
|
+
for item in data:
|
|
910
|
+
item_path = Path(item)
|
|
911
|
+
if item_path.suffix.lower() == ".parquet":
|
|
912
|
+
if item_path.exists():
|
|
913
|
+
parquet_paths.append(item_path)
|
|
914
|
+
else:
|
|
915
|
+
raise FileNotFoundError(f"Parquet file not found: {item_path}")
|
|
916
|
+
else:
|
|
917
|
+
# If any item is not a parquet file, return original data
|
|
918
|
+
return data
|
|
919
|
+
|
|
920
|
+
# If no parquet files found, return original data
|
|
921
|
+
if not parquet_paths:
|
|
922
|
+
return data
|
|
923
|
+
|
|
924
|
+
# Read the parquet file(s) using available libraries; prefer Polars, fallback to Pandas
|
|
925
|
+
if _is_lib_present(lib_name="polars"):
|
|
926
|
+
try:
|
|
927
|
+
import polars as pl
|
|
928
|
+
|
|
929
|
+
if len(parquet_paths) == 1:
|
|
930
|
+
# Single file
|
|
931
|
+
return pl.read_parquet(parquet_paths[0])
|
|
932
|
+
else:
|
|
933
|
+
# Multiple files: concatenate them
|
|
934
|
+
dfs = [pl.read_parquet(path) for path in parquet_paths]
|
|
935
|
+
return pl.concat(dfs, how="vertical_relaxed")
|
|
936
|
+
except Exception as e:
|
|
937
|
+
# If Polars fails, try Pandas if available
|
|
938
|
+
if _is_lib_present(lib_name="pandas"):
|
|
939
|
+
import pandas as pd
|
|
940
|
+
|
|
941
|
+
if len(parquet_paths) == 1:
|
|
942
|
+
return pd.read_parquet(parquet_paths[0])
|
|
943
|
+
else:
|
|
944
|
+
# Multiple files: concatenate them
|
|
945
|
+
dfs = [pd.read_parquet(path) for path in parquet_paths]
|
|
946
|
+
return pd.concat(dfs, ignore_index=True)
|
|
947
|
+
else:
|
|
948
|
+
raise RuntimeError(
|
|
949
|
+
f"Failed to read Parquet file(s) with Polars: {e}. "
|
|
950
|
+
"Pandas is not available as fallback."
|
|
951
|
+
) from e
|
|
952
|
+
elif _is_lib_present(lib_name="pandas"):
|
|
953
|
+
try:
|
|
954
|
+
import pandas as pd
|
|
955
|
+
|
|
956
|
+
if len(parquet_paths) == 1:
|
|
957
|
+
return pd.read_parquet(parquet_paths[0])
|
|
958
|
+
else:
|
|
959
|
+
# Multiple files: concatenate them
|
|
960
|
+
dfs = [pd.read_parquet(path) for path in parquet_paths]
|
|
961
|
+
return pd.concat(dfs, ignore_index=True)
|
|
962
|
+
except Exception as e:
|
|
963
|
+
raise RuntimeError(f"Failed to read Parquet file(s) with Pandas: {e}") from e
|
|
964
|
+
else:
|
|
965
|
+
raise ImportError(
|
|
966
|
+
"Neither Polars nor Pandas is available for reading Parquet files. "
|
|
967
|
+
"Please install either 'polars' or 'pandas' to use Parquet file inputs."
|
|
968
|
+
)
|
|
969
|
+
|
|
970
|
+
|
|
563
971
|
def preview(
|
|
564
972
|
data: FrameT | Any,
|
|
565
973
|
columns_subset: str | list[str] | Column | None = None,
|
|
@@ -590,8 +998,14 @@ def preview(
|
|
|
590
998
|
Parameters
|
|
591
999
|
----------
|
|
592
1000
|
data
|
|
593
|
-
The table to preview, which could be a DataFrame object
|
|
594
|
-
|
|
1001
|
+
The table to preview, which could be a DataFrame object, an Ibis table object, a CSV
|
|
1002
|
+
file path, a Parquet file path, or a database connection string. When providing a CSV or
|
|
1003
|
+
Parquet file path (as a string or `pathlib.Path` object), the file will be automatically
|
|
1004
|
+
loaded using an available DataFrame library (Polars or Pandas). Parquet input also supports
|
|
1005
|
+
glob patterns, directories containing .parquet files, and Spark-style partitioned datasets.
|
|
1006
|
+
Connection strings enable direct database access via Ibis with optional table specification
|
|
1007
|
+
using the `::table_name` suffix. Read the *Supported Input Table Types* section for details
|
|
1008
|
+
on the supported table types.
|
|
595
1009
|
columns_subset
|
|
596
1010
|
The columns to display in the table, by default `None` (all columns are shown). This can
|
|
597
1011
|
be a string, a list of strings, a `Column` object, or a `ColumnSelector` object. The latter
|
|
@@ -636,13 +1050,40 @@ def preview(
|
|
|
636
1050
|
- MySQL table (`"mysql"`)*
|
|
637
1051
|
- PostgreSQL table (`"postgresql"`)*
|
|
638
1052
|
- SQLite table (`"sqlite"`)*
|
|
1053
|
+
- Microsoft SQL Server table (`"mssql"`)*
|
|
1054
|
+
- Snowflake table (`"snowflake"`)*
|
|
1055
|
+
- Databricks table (`"databricks"`)*
|
|
1056
|
+
- PySpark table (`"pyspark"`)*
|
|
1057
|
+
- BigQuery table (`"bigquery"`)*
|
|
639
1058
|
- Parquet table (`"parquet"`)*
|
|
1059
|
+
- CSV files (string path or `pathlib.Path` object with `.csv` extension)
|
|
1060
|
+
- Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
|
|
1061
|
+
extension, or partitioned dataset)
|
|
1062
|
+
- Database connection strings (URI format with optional table specification)
|
|
640
1063
|
|
|
641
1064
|
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
|
|
642
1065
|
`ibis.expr.types.relations.Table`). Furthermore, using `preview()` with these types of tables
|
|
643
1066
|
requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a Polars or
|
|
644
1067
|
Pandas DataFrame, the availability of Ibis is not needed.
|
|
645
1068
|
|
|
1069
|
+
To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
|
|
1070
|
+
provided. The file will be automatically detected and loaded using the best available DataFrame
|
|
1071
|
+
library. The loading preference is Polars first, then Pandas as a fallback.
|
|
1072
|
+
|
|
1073
|
+
Connection strings follow database URL formats and must also specify a table using the
|
|
1074
|
+
`::table_name` suffix. Examples include:
|
|
1075
|
+
|
|
1076
|
+
```
|
|
1077
|
+
"duckdb:///path/to/database.ddb::table_name"
|
|
1078
|
+
"sqlite:///path/to/database.db::table_name"
|
|
1079
|
+
"postgresql://user:password@localhost:5432/database::table_name"
|
|
1080
|
+
"mysql://user:password@localhost:3306/database::table_name"
|
|
1081
|
+
"bigquery://project/dataset::table_name"
|
|
1082
|
+
"snowflake://user:password@account/database/schema::table_name"
|
|
1083
|
+
```
|
|
1084
|
+
|
|
1085
|
+
When using connection strings, the Ibis library with the appropriate backend driver is required.
|
|
1086
|
+
|
|
646
1087
|
Examples
|
|
647
1088
|
--------
|
|
648
1089
|
It's easy to preview a table using the `preview()` function. Here's an example using the
|
|
@@ -709,8 +1150,80 @@ def preview(
|
|
|
709
1150
|
columns_subset=pb.col(pb.starts_with("item") | pb.matches("player"))
|
|
710
1151
|
)
|
|
711
1152
|
```
|
|
1153
|
+
|
|
1154
|
+
### Working with CSV Files
|
|
1155
|
+
|
|
1156
|
+
The `preview()` function can directly accept CSV file paths, making it easy to preview data
|
|
1157
|
+
stored in CSV files without manual loading:
|
|
1158
|
+
|
|
1159
|
+
```{python}
|
|
1160
|
+
# Get a path to a CSV file from the package data
|
|
1161
|
+
csv_path = pb.get_data_path("global_sales", "csv")
|
|
1162
|
+
|
|
1163
|
+
pb.preview(csv_path)
|
|
1164
|
+
```
|
|
1165
|
+
|
|
1166
|
+
You can also use a Path object to specify the CSV file:
|
|
1167
|
+
|
|
1168
|
+
```{python}
|
|
1169
|
+
from pathlib import Path
|
|
1170
|
+
|
|
1171
|
+
csv_file = Path(pb.get_data_path("game_revenue", "csv"))
|
|
1172
|
+
|
|
1173
|
+
pb.preview(csv_file, n_head=3, n_tail=3)
|
|
1174
|
+
```
|
|
1175
|
+
|
|
1176
|
+
### Working with Parquet Files
|
|
1177
|
+
|
|
1178
|
+
The `preview()` function can directly accept Parquet files and datasets in various formats:
|
|
1179
|
+
|
|
1180
|
+
```{python}
|
|
1181
|
+
# Single Parquet file from package data
|
|
1182
|
+
parquet_path = pb.get_data_path("nycflights", "parquet")
|
|
1183
|
+
|
|
1184
|
+
pb.preview(parquet_path)
|
|
1185
|
+
```
|
|
1186
|
+
|
|
1187
|
+
You can also use glob patterns and directories:
|
|
1188
|
+
|
|
1189
|
+
```python
|
|
1190
|
+
# Multiple Parquet files with glob patterns
|
|
1191
|
+
pb.preview("data/sales_*.parquet")
|
|
1192
|
+
|
|
1193
|
+
# Directory containing Parquet files
|
|
1194
|
+
pb.preview("parquet_data/")
|
|
1195
|
+
|
|
1196
|
+
# Partitioned Parquet dataset
|
|
1197
|
+
pb.preview("sales_data/") # Auto-discovers partition columns
|
|
1198
|
+
```
|
|
1199
|
+
|
|
1200
|
+
### Working with Database Connection Strings
|
|
1201
|
+
|
|
1202
|
+
The `preview()` function supports database connection strings for direct preview of database
|
|
1203
|
+
tables. Connection strings must specify a table using the `::table_name` suffix:
|
|
1204
|
+
|
|
1205
|
+
```{python}
|
|
1206
|
+
# Get path to a DuckDB database file from package data
|
|
1207
|
+
duckdb_path = pb.get_data_path("game_revenue", "duckdb")
|
|
1208
|
+
|
|
1209
|
+
pb.preview(f"duckdb:///{duckdb_path}::game_revenue")
|
|
1210
|
+
```
|
|
1211
|
+
|
|
1212
|
+
For comprehensive documentation on supported connection string formats, error handling, and
|
|
1213
|
+
installation requirements, see the [`connect_to_table()`](`pointblank.connect_to_table`)
|
|
1214
|
+
function.
|
|
712
1215
|
"""
|
|
713
1216
|
|
|
1217
|
+
# Process input data to handle different data source types
|
|
1218
|
+
# Handle connection string input (e.g., "duckdb:///path/to/file.ddb::table_name")
|
|
1219
|
+
data = _process_connection_string(data)
|
|
1220
|
+
|
|
1221
|
+
# Handle CSV file input (e.g., "data.csv" or Path("data.csv"))
|
|
1222
|
+
data = _process_csv_input(data)
|
|
1223
|
+
|
|
1224
|
+
# Handle Parquet file input (e.g., "data.parquet", "data/*.parquet", "data/")
|
|
1225
|
+
data = _process_parquet_input(data)
|
|
1226
|
+
|
|
714
1227
|
if incl_header is None:
|
|
715
1228
|
incl_header = global_config.preview_incl_header
|
|
716
1229
|
|
|
@@ -908,7 +1421,7 @@ def _generate_display_table(
|
|
|
908
1421
|
k: v.split("(")[0] if "(" in v else v for k, v in col_dtype_dict.items()
|
|
909
1422
|
}
|
|
910
1423
|
|
|
911
|
-
# Create a dictionary of column and row positions where the value is None/NA/
|
|
1424
|
+
# Create a dictionary of column and row positions where the value is None/NA/Null
|
|
912
1425
|
# This is used to highlight these values in the table
|
|
913
1426
|
if df_lib_name_gt == "polars":
|
|
914
1427
|
none_values = {k: data[k].is_null().to_list() for k in col_names}
|
|
@@ -932,7 +1445,10 @@ def _generate_display_table(
|
|
|
932
1445
|
column_values = gt.gt._get_column_of_values(built_gt, column_name=column, context="html")
|
|
933
1446
|
|
|
934
1447
|
# Get the maximum number of characters in the column
|
|
935
|
-
|
|
1448
|
+
if column_values: # Check if column_values is not empty
|
|
1449
|
+
max_length_col_vals.append(max([len(str(val)) for val in column_values]))
|
|
1450
|
+
else:
|
|
1451
|
+
max_length_col_vals.append(0) # Use 0 for empty columns
|
|
936
1452
|
|
|
937
1453
|
length_col_names = [len(column) for column in col_dtype_dict.keys()]
|
|
938
1454
|
length_data_types = [len(dtype) for dtype in col_dtype_dict_short.values()]
|
|
@@ -1003,8 +1519,12 @@ def _generate_display_table(
|
|
|
1003
1519
|
|
|
1004
1520
|
# Get the highest number in the `row_number_list` and calculate a width that will
|
|
1005
1521
|
# safely fit a number of that magnitude
|
|
1006
|
-
|
|
1007
|
-
|
|
1522
|
+
if row_number_list: # Check if list is not empty
|
|
1523
|
+
max_row_num = max(row_number_list)
|
|
1524
|
+
max_row_num_width = len(str(max_row_num)) * 7.8 + 10
|
|
1525
|
+
else:
|
|
1526
|
+
# If row_number_list is empty, use a default width
|
|
1527
|
+
max_row_num_width = 7.8 * 2 + 10 # Width for 2-digit numbers
|
|
1008
1528
|
|
|
1009
1529
|
# Update the col_width_dict to include the row number column
|
|
1010
1530
|
col_width_dict = {"_row_num_": f"{max_row_num_width}px"} | col_width_dict
|
|
@@ -1134,6 +1654,11 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
|
|
|
1134
1654
|
- MySQL table (`"mysql"`)*
|
|
1135
1655
|
- PostgreSQL table (`"postgresql"`)*
|
|
1136
1656
|
- SQLite table (`"sqlite"`)*
|
|
1657
|
+
- Microsoft SQL Server table (`"mssql"`)*
|
|
1658
|
+
- Snowflake table (`"snowflake"`)*
|
|
1659
|
+
- Databricks table (`"databricks"`)*
|
|
1660
|
+
- PySpark table (`"pyspark"`)*
|
|
1661
|
+
- BigQuery table (`"bigquery"`)*
|
|
1137
1662
|
- Parquet table (`"parquet"`)*
|
|
1138
1663
|
|
|
1139
1664
|
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
|
|
@@ -1663,6 +2188,11 @@ def get_column_count(data: FrameT | Any) -> int:
|
|
|
1663
2188
|
- MySQL table (`"mysql"`)*
|
|
1664
2189
|
- PostgreSQL table (`"postgresql"`)*
|
|
1665
2190
|
- SQLite table (`"sqlite"`)*
|
|
2191
|
+
- Microsoft SQL Server table (`"mssql"`)*
|
|
2192
|
+
- Snowflake table (`"snowflake"`)*
|
|
2193
|
+
- Databricks table (`"databricks"`)*
|
|
2194
|
+
- PySpark table (`"pyspark"`)*
|
|
2195
|
+
- BigQuery table (`"bigquery"`)*
|
|
1666
2196
|
- Parquet table (`"parquet"`)*
|
|
1667
2197
|
|
|
1668
2198
|
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
|
|
@@ -1707,6 +2237,9 @@ def get_column_count(data: FrameT | Any) -> int:
|
|
|
1707
2237
|
elif "pandas" in str(type(data)):
|
|
1708
2238
|
return data.shape[1]
|
|
1709
2239
|
|
|
2240
|
+
elif "narwhals" in str(type(data)):
|
|
2241
|
+
return len(data.columns)
|
|
2242
|
+
|
|
1710
2243
|
else:
|
|
1711
2244
|
raise ValueError("The input table type supplied in `data=` is not supported.")
|
|
1712
2245
|
|
|
@@ -1741,6 +2274,11 @@ def get_row_count(data: FrameT | Any) -> int:
|
|
|
1741
2274
|
- MySQL table (`"mysql"`)*
|
|
1742
2275
|
- PostgreSQL table (`"postgresql"`)*
|
|
1743
2276
|
- SQLite table (`"sqlite"`)*
|
|
2277
|
+
- Microsoft SQL Server table (`"mssql"`)*
|
|
2278
|
+
- Snowflake table (`"snowflake"`)*
|
|
2279
|
+
- Databricks table (`"databricks"`)*
|
|
2280
|
+
- PySpark table (`"pyspark"`)*
|
|
2281
|
+
- BigQuery table (`"bigquery"`)*
|
|
1744
2282
|
- Parquet table (`"parquet"`)*
|
|
1745
2283
|
|
|
1746
2284
|
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
|
|
@@ -1795,6 +2333,9 @@ def get_row_count(data: FrameT | Any) -> int:
|
|
|
1795
2333
|
elif "pandas" in str(type(data)):
|
|
1796
2334
|
return data.shape[0]
|
|
1797
2335
|
|
|
2336
|
+
elif "narwhals" in str(type(data)):
|
|
2337
|
+
return data.shape[0]
|
|
2338
|
+
|
|
1798
2339
|
else:
|
|
1799
2340
|
raise ValueError("The input table type supplied in `data=` is not supported.")
|
|
1800
2341
|
|
|
@@ -1910,6 +2451,239 @@ class _ValidationInfo:
|
|
|
1910
2451
|
return self.val_info
|
|
1911
2452
|
|
|
1912
2453
|
|
|
2454
|
+
def connect_to_table(connection_string: str) -> Any:
|
|
2455
|
+
"""
|
|
2456
|
+
Connect to a database table using a connection string.
|
|
2457
|
+
|
|
2458
|
+
This utility function tests whether a connection string leads to a valid table and returns
|
|
2459
|
+
the table object if successful. It provides helpful error messages when no table is specified
|
|
2460
|
+
or when backend dependencies are missing.
|
|
2461
|
+
|
|
2462
|
+
Parameters
|
|
2463
|
+
----------
|
|
2464
|
+
connection_string
|
|
2465
|
+
A database connection string with a required table specification using the `::table_name`
|
|
2466
|
+
suffix. Supported formats are outlined in the *Supported Connection String Formats* section.
|
|
2467
|
+
|
|
2468
|
+
Returns
|
|
2469
|
+
-------
|
|
2470
|
+
Any
|
|
2471
|
+
An Ibis table object for the specified database table.
|
|
2472
|
+
|
|
2473
|
+
Supported Connection String Formats
|
|
2474
|
+
-----------------------------------
|
|
2475
|
+
The `connection_string` parameter must include a valid connection string with a table name
|
|
2476
|
+
specified using the `::` syntax. Here are some examples on how to format connection strings
|
|
2477
|
+
for various backends:
|
|
2478
|
+
|
|
2479
|
+
```
|
|
2480
|
+
DuckDB: "duckdb:///path/to/database.ddb::table_name"
|
|
2481
|
+
SQLite: "sqlite:///path/to/database.db::table_name"
|
|
2482
|
+
PostgreSQL: "postgresql://user:password@localhost:5432/database::table_name"
|
|
2483
|
+
MySQL: "mysql://user:password@localhost:3306/database::table_name"
|
|
2484
|
+
BigQuery: "bigquery://project/dataset::table_name"
|
|
2485
|
+
Snowflake: "snowflake://user:password@account/database/schema::table_name"
|
|
2486
|
+
```
|
|
2487
|
+
|
|
2488
|
+
If the connection string does not include a table name, the function will attempt to connect to
|
|
2489
|
+
the database and list available tables, providing guidance on how to specify a table.
|
|
2490
|
+
|
|
2491
|
+
Examples
|
|
2492
|
+
--------
|
|
2493
|
+
Connect to a DuckDB table:
|
|
2494
|
+
|
|
2495
|
+
```{python}
|
|
2496
|
+
import pointblank as pb
|
|
2497
|
+
|
|
2498
|
+
# Get path to a DuckDB database file from package data
|
|
2499
|
+
duckdb_path = pb.get_data_path("game_revenue", "duckdb")
|
|
2500
|
+
|
|
2501
|
+
# Connect to the `game_revenue` table in the DuckDB database
|
|
2502
|
+
game_revenue = pb.connect_to_table(f"duckdb:///{duckdb_path}::game_revenue")
|
|
2503
|
+
|
|
2504
|
+
# Use with the `preview()` function
|
|
2505
|
+
pb.preview(game_revenue)
|
|
2506
|
+
```
|
|
2507
|
+
|
|
2508
|
+
Here are some backend-specific connection examples:
|
|
2509
|
+
|
|
2510
|
+
```python
|
|
2511
|
+
# PostgreSQL
|
|
2512
|
+
pg_table = pb.connect_to_table(
|
|
2513
|
+
"postgresql://user:password@localhost:5432/warehouse::customer_data"
|
|
2514
|
+
)
|
|
2515
|
+
|
|
2516
|
+
# SQLite
|
|
2517
|
+
sqlite_table = pb.connect_to_table("sqlite:///local_data.db::products")
|
|
2518
|
+
|
|
2519
|
+
# BigQuery
|
|
2520
|
+
bq_table = pb.connect_to_table("bigquery://my-project/analytics::daily_metrics")
|
|
2521
|
+
```
|
|
2522
|
+
|
|
2523
|
+
This function requires the Ibis library with appropriate backend drivers:
|
|
2524
|
+
|
|
2525
|
+
```bash
|
|
2526
|
+
# You can install a set of common backends:
|
|
2527
|
+
pip install 'ibis-framework[duckdb,postgres,mysql,sqlite]'
|
|
2528
|
+
|
|
2529
|
+
# ...or specific backends as needed:
|
|
2530
|
+
pip install 'ibis-framework[duckdb]' # for DuckDB
|
|
2531
|
+
pip install 'ibis-framework[postgres]' # for PostgreSQL
|
|
2532
|
+
```
|
|
2533
|
+
"""
|
|
2534
|
+
# Check if Ibis is available
|
|
2535
|
+
if not _is_lib_present(lib_name="ibis"):
|
|
2536
|
+
raise ImportError(
|
|
2537
|
+
"The Ibis library is not installed but is required for database connection strings.\n"
|
|
2538
|
+
"Install it with: pip install 'ibis-framework[duckdb]' (or other backend as needed)"
|
|
2539
|
+
)
|
|
2540
|
+
|
|
2541
|
+
import ibis
|
|
2542
|
+
|
|
2543
|
+
# Check if connection string includes table specification
|
|
2544
|
+
if "::" not in connection_string:
|
|
2545
|
+
# Try to connect to get available tables for helpful error message
|
|
2546
|
+
try:
|
|
2547
|
+
# Extract the base connection string (without table name)
|
|
2548
|
+
base_connection = connection_string
|
|
2549
|
+
|
|
2550
|
+
# Connect to the database
|
|
2551
|
+
conn = ibis.connect(base_connection)
|
|
2552
|
+
|
|
2553
|
+
# Get list of available tables
|
|
2554
|
+
try:
|
|
2555
|
+
available_tables = conn.list_tables()
|
|
2556
|
+
except Exception:
|
|
2557
|
+
available_tables = []
|
|
2558
|
+
|
|
2559
|
+
conn.disconnect()
|
|
2560
|
+
|
|
2561
|
+
# Create helpful error message
|
|
2562
|
+
if available_tables:
|
|
2563
|
+
table_list = "\n".join(f" - {table}" for table in available_tables)
|
|
2564
|
+
error_msg = (
|
|
2565
|
+
f"No table specified in connection string: {connection_string}\n\n"
|
|
2566
|
+
f"Available tables in the database:\n{table_list}\n\n"
|
|
2567
|
+
f"To access a specific table, use the format:\n"
|
|
2568
|
+
f" {connection_string}::TABLE_NAME\n\n"
|
|
2569
|
+
f"Examples:\n"
|
|
2570
|
+
)
|
|
2571
|
+
# Add examples with first few table names
|
|
2572
|
+
for table in available_tables[:3]:
|
|
2573
|
+
error_msg += f" {connection_string}::{table}\n"
|
|
2574
|
+
else:
|
|
2575
|
+
error_msg = (
|
|
2576
|
+
f"No table specified in connection string: {connection_string}\n\n"
|
|
2577
|
+
f"No tables found in the database or unable to list tables.\n\n"
|
|
2578
|
+
f"To access a specific table, use the format:\n"
|
|
2579
|
+
f" {connection_string}::TABLE_NAME"
|
|
2580
|
+
)
|
|
2581
|
+
|
|
2582
|
+
raise ValueError(error_msg)
|
|
2583
|
+
|
|
2584
|
+
except Exception as e:
|
|
2585
|
+
if isinstance(e, ValueError):
|
|
2586
|
+
raise # Re-raise our custom ValueError
|
|
2587
|
+
|
|
2588
|
+
# Check for backend-specific errors and provide installation guidance
|
|
2589
|
+
error_str = str(e).lower()
|
|
2590
|
+
backend_install_map = {
|
|
2591
|
+
"duckdb": "pip install 'ibis-framework[duckdb]'",
|
|
2592
|
+
"postgresql": "pip install 'ibis-framework[postgres]'",
|
|
2593
|
+
"postgres": "pip install 'ibis-framework[postgres]'",
|
|
2594
|
+
"mysql": "pip install 'ibis-framework[mysql]'",
|
|
2595
|
+
"sqlite": "pip install 'ibis-framework[sqlite]'",
|
|
2596
|
+
"bigquery": "pip install 'ibis-framework[bigquery]'",
|
|
2597
|
+
"snowflake": "pip install 'ibis-framework[snowflake]'",
|
|
2598
|
+
}
|
|
2599
|
+
|
|
2600
|
+
# Check if this is a missing backend dependency
|
|
2601
|
+
for backend, install_cmd in backend_install_map.items():
|
|
2602
|
+
if backend in error_str and ("not found" in error_str or "no module" in error_str):
|
|
2603
|
+
raise ConnectionError(
|
|
2604
|
+
f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
|
|
2605
|
+
f" {install_cmd}\n\n"
|
|
2606
|
+
f"Original error: {e}\n\n"
|
|
2607
|
+
f"Supported connection string formats:\n"
|
|
2608
|
+
f"- DuckDB: 'duckdb:///path/to/file.ddb::table_name'\n"
|
|
2609
|
+
f"- SQLite: 'sqlite:///path/to/file.db::table_name'\n"
|
|
2610
|
+
f"- PostgreSQL: 'postgresql://user:pass@host:port/db::table_name'\n"
|
|
2611
|
+
f"- MySQL: 'mysql://user:pass@host:port/db::table_name'\n"
|
|
2612
|
+
f"- BigQuery: 'bigquery://project/dataset::table_name'\n"
|
|
2613
|
+
f"- Snowflake: 'snowflake://user:pass@account/db/schema::table_name'\n"
|
|
2614
|
+
f"\nNote: Use '::table_name' to specify the table within the database."
|
|
2615
|
+
) from e
|
|
2616
|
+
|
|
2617
|
+
# Generic connection error
|
|
2618
|
+
raise ConnectionError(
|
|
2619
|
+
f"Failed to connect to database using connection string: {connection_string}\n"
|
|
2620
|
+
f"Error: {e}\n\n"
|
|
2621
|
+
f"No table specified. Use the format: {connection_string}::TABLE_NAME"
|
|
2622
|
+
) from e
|
|
2623
|
+
|
|
2624
|
+
# Split connection string and table name
|
|
2625
|
+
try:
|
|
2626
|
+
base_connection, table_name = connection_string.rsplit("::", 1)
|
|
2627
|
+
except ValueError:
|
|
2628
|
+
raise ValueError(f"Invalid connection string format: {connection_string}")
|
|
2629
|
+
|
|
2630
|
+
# Connect to database and get table
|
|
2631
|
+
try:
|
|
2632
|
+
conn = ibis.connect(base_connection)
|
|
2633
|
+
table = conn.table(table_name)
|
|
2634
|
+
return table
|
|
2635
|
+
|
|
2636
|
+
except Exception as e:
|
|
2637
|
+
# Check for backend-specific errors and provide installation guidance
|
|
2638
|
+
error_str = str(e).lower()
|
|
2639
|
+
backend_install_map = {
|
|
2640
|
+
"duckdb": "pip install 'ibis-framework[duckdb]'",
|
|
2641
|
+
"postgresql": "pip install 'ibis-framework[postgres]'",
|
|
2642
|
+
"postgres": "pip install 'ibis-framework[postgres]'",
|
|
2643
|
+
"mysql": "pip install 'ibis-framework[mysql]'",
|
|
2644
|
+
"sqlite": "pip install 'ibis-framework[sqlite]'",
|
|
2645
|
+
"bigquery": "pip install 'ibis-framework[bigquery]'",
|
|
2646
|
+
"snowflake": "pip install 'ibis-framework[snowflake]'",
|
|
2647
|
+
}
|
|
2648
|
+
|
|
2649
|
+
# Check if this is a missing backend dependency
|
|
2650
|
+
for backend, install_cmd in backend_install_map.items():
|
|
2651
|
+
if backend in error_str and ("not found" in error_str or "no module" in error_str):
|
|
2652
|
+
raise ConnectionError(
|
|
2653
|
+
f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
|
|
2654
|
+
f" {install_cmd}\n\n"
|
|
2655
|
+
f"Original error: {e}"
|
|
2656
|
+
) from e
|
|
2657
|
+
|
|
2658
|
+
# Check if table doesn't exist
|
|
2659
|
+
if "table" in error_str and ("not found" in error_str or "does not exist" in error_str):
|
|
2660
|
+
# Try to get available tables for helpful message
|
|
2661
|
+
try:
|
|
2662
|
+
available_tables = conn.list_tables()
|
|
2663
|
+
if available_tables:
|
|
2664
|
+
table_list = "\n".join(f" - {table}" for table in available_tables)
|
|
2665
|
+
raise ValueError(
|
|
2666
|
+
f"Table '{table_name}' not found in database.\n\n"
|
|
2667
|
+
f"Available tables:\n{table_list}\n\n"
|
|
2668
|
+
f"Check the table name and try again with:\n"
|
|
2669
|
+
f" {base_connection}::CORRECT_TABLE_NAME"
|
|
2670
|
+
) from e
|
|
2671
|
+
else:
|
|
2672
|
+
raise ValueError(
|
|
2673
|
+
f"Table '{table_name}' not found and no tables available in database."
|
|
2674
|
+
) from e
|
|
2675
|
+
except Exception:
|
|
2676
|
+
raise ValueError(
|
|
2677
|
+
f"Table '{table_name}' not found in database. "
|
|
2678
|
+
f"Check the table name and connection string."
|
|
2679
|
+
) from e
|
|
2680
|
+
|
|
2681
|
+
# Generic connection error
|
|
2682
|
+
raise ConnectionError(
|
|
2683
|
+
f"Failed to connect to table '{table_name}' using: {base_connection}\nError: {e}"
|
|
2684
|
+
) from e
|
|
2685
|
+
|
|
2686
|
+
|
|
1913
2687
|
@dataclass
|
|
1914
2688
|
class Validate:
|
|
1915
2689
|
"""
|
|
@@ -1942,8 +2716,14 @@ class Validate:
|
|
|
1942
2716
|
Parameters
|
|
1943
2717
|
----------
|
|
1944
2718
|
data
|
|
1945
|
-
The table to validate, which could be a DataFrame object
|
|
1946
|
-
|
|
2719
|
+
The table to validate, which could be a DataFrame object, an Ibis table object, a CSV
|
|
2720
|
+
file path, a Parquet file path, or a database connection string. When providing a CSV or
|
|
2721
|
+
Parquet file path (as a string or `pathlib.Path` object), the file will be automatically
|
|
2722
|
+
loaded using an available DataFrame library (Polars or Pandas). Parquet input also supports
|
|
2723
|
+
glob patterns, directories containing .parquet files, and Spark-style partitioned datasets.
|
|
2724
|
+
Connection strings enable direct database access via Ibis with optional table specification
|
|
2725
|
+
using the `::table_name` suffix. Read the *Supported Input Table Types* section for details
|
|
2726
|
+
on the supported table types.
|
|
1947
2727
|
tbl_name
|
|
1948
2728
|
An optional name to assign to the input table object. If no value is provided, a name will
|
|
1949
2729
|
be generated based on whatever information is available. This table name will be displayed
|
|
@@ -2007,13 +2787,40 @@ class Validate:
|
|
|
2007
2787
|
- MySQL table (`"mysql"`)*
|
|
2008
2788
|
- PostgreSQL table (`"postgresql"`)*
|
|
2009
2789
|
- SQLite table (`"sqlite"`)*
|
|
2790
|
+
- Microsoft SQL Server table (`"mssql"`)*
|
|
2791
|
+
- Snowflake table (`"snowflake"`)*
|
|
2792
|
+
- Databricks table (`"databricks"`)*
|
|
2793
|
+
- PySpark table (`"pyspark"`)*
|
|
2794
|
+
- BigQuery table (`"bigquery"`)*
|
|
2010
2795
|
- Parquet table (`"parquet"`)*
|
|
2796
|
+
- CSV files (string path or `pathlib.Path` object with `.csv` extension)
|
|
2797
|
+
- Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
|
|
2798
|
+
extension, or partitioned dataset)
|
|
2799
|
+
- Database connection strings (URI format with optional table specification)
|
|
2011
2800
|
|
|
2012
2801
|
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
|
|
2013
2802
|
`ibis.expr.types.relations.Table`). Furthermore, the use of `Validate` with such tables requires
|
|
2014
2803
|
the Ibis library v9.5.0 and above to be installed. If the input table is a Polars or Pandas
|
|
2015
2804
|
DataFrame, the Ibis library is not required.
|
|
2016
2805
|
|
|
2806
|
+
To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
|
|
2807
|
+
provided. The file will be automatically detected and loaded using the best available DataFrame
|
|
2808
|
+
library. The loading preference is Polars first, then Pandas as a fallback.
|
|
2809
|
+
|
|
2810
|
+
Connection strings follow database URL formats and must also specify a table using the
|
|
2811
|
+
`::table_name` suffix. Examples include:
|
|
2812
|
+
|
|
2813
|
+
```
|
|
2814
|
+
"duckdb:///path/to/database.ddb::table_name"
|
|
2815
|
+
"sqlite:///path/to/database.db::table_name"
|
|
2816
|
+
"postgresql://user:password@localhost:5432/database::table_name"
|
|
2817
|
+
"mysql://user:password@localhost:3306/database::table_name"
|
|
2818
|
+
"bigquery://project/dataset::table_name"
|
|
2819
|
+
"snowflake://user:password@account/database/schema::table_name"
|
|
2820
|
+
```
|
|
2821
|
+
|
|
2822
|
+
When using connection strings, the Ibis library with the appropriate backend driver is required.
|
|
2823
|
+
|
|
2017
2824
|
Thresholds
|
|
2018
2825
|
----------
|
|
2019
2826
|
The `thresholds=` parameter is used to set the failure-condition levels for all validation
|
|
@@ -2170,8 +2977,8 @@ class Validate:
|
|
|
2170
2977
|
```{python}
|
|
2171
2978
|
import pointblank as pb
|
|
2172
2979
|
|
|
2173
|
-
# Load the small_table dataset
|
|
2174
|
-
small_table = pb.load_dataset()
|
|
2980
|
+
# Load the `small_table` dataset
|
|
2981
|
+
small_table = pb.load_dataset(dataset="small_table", tbl_type="polars")
|
|
2175
2982
|
|
|
2176
2983
|
# Preview the table
|
|
2177
2984
|
pb.preview(small_table)
|
|
@@ -2237,7 +3044,7 @@ class Validate:
|
|
|
2237
3044
|
brief). Here's an example of a global setting for briefs:
|
|
2238
3045
|
|
|
2239
3046
|
```{python}
|
|
2240
|
-
|
|
3047
|
+
validation_2 = (
|
|
2241
3048
|
pb.Validate(
|
|
2242
3049
|
data=pb.load_dataset(),
|
|
2243
3050
|
tbl_name="small_table",
|
|
@@ -2254,7 +3061,7 @@ class Validate:
|
|
|
2254
3061
|
.interrogate()
|
|
2255
3062
|
)
|
|
2256
3063
|
|
|
2257
|
-
|
|
3064
|
+
validation_2
|
|
2258
3065
|
```
|
|
2259
3066
|
|
|
2260
3067
|
We see the text of the briefs appear in the `STEP` column of the reporting table. Furthermore,
|
|
@@ -2272,7 +3079,7 @@ class Validate:
|
|
|
2272
3079
|
the data extracts for each validation step.
|
|
2273
3080
|
|
|
2274
3081
|
```{python}
|
|
2275
|
-
|
|
3082
|
+
validation_2.get_data_extracts()
|
|
2276
3083
|
```
|
|
2277
3084
|
|
|
2278
3085
|
We can also view step reports for each validation step using the
|
|
@@ -2280,7 +3087,7 @@ class Validate:
|
|
|
2280
3087
|
type of validation step and shows the relevant information for a step's validation.
|
|
2281
3088
|
|
|
2282
3089
|
```{python}
|
|
2283
|
-
|
|
3090
|
+
validation_2.get_step_report(i=2)
|
|
2284
3091
|
```
|
|
2285
3092
|
|
|
2286
3093
|
The `Validate` class also has a method for getting the sundered data, which is the data that
|
|
@@ -2288,11 +3095,141 @@ class Validate:
|
|
|
2288
3095
|
[`get_sundered_data()`](`pointblank.Validate.get_sundered_data`) method.
|
|
2289
3096
|
|
|
2290
3097
|
```{python}
|
|
2291
|
-
pb.preview(
|
|
3098
|
+
pb.preview(validation_2.get_sundered_data())
|
|
2292
3099
|
```
|
|
2293
3100
|
|
|
2294
3101
|
The sundered data is a DataFrame that contains the rows that passed or failed the validation.
|
|
2295
3102
|
The default behavior is to return the rows that failed the validation, as shown above.
|
|
3103
|
+
|
|
3104
|
+
### Working with CSV Files
|
|
3105
|
+
|
|
3106
|
+
The `Validate` class can directly accept CSV file paths, making it easy to validate data stored
|
|
3107
|
+
in CSV files without manual loading:
|
|
3108
|
+
|
|
3109
|
+
```{python}
|
|
3110
|
+
# Get a path to a CSV file from the package data
|
|
3111
|
+
csv_path = pb.get_data_path("global_sales", "csv")
|
|
3112
|
+
|
|
3113
|
+
validation_3 = (
|
|
3114
|
+
pb.Validate(
|
|
3115
|
+
data=csv_path,
|
|
3116
|
+
label="CSV validation example"
|
|
3117
|
+
)
|
|
3118
|
+
.col_exists(["customer_id", "product_id", "revenue"])
|
|
3119
|
+
.col_vals_not_null(["customer_id", "product_id"])
|
|
3120
|
+
.col_vals_gt(columns="revenue", value=0)
|
|
3121
|
+
.interrogate()
|
|
3122
|
+
)
|
|
3123
|
+
|
|
3124
|
+
validation_3
|
|
3125
|
+
```
|
|
3126
|
+
|
|
3127
|
+
You can also use a Path object to specify the CSV file. Here's an example of how to do that:
|
|
3128
|
+
|
|
3129
|
+
```{python}
|
|
3130
|
+
from pathlib import Path
|
|
3131
|
+
|
|
3132
|
+
csv_file = Path(pb.get_data_path("game_revenue", "csv"))
|
|
3133
|
+
|
|
3134
|
+
validation_4 = (
|
|
3135
|
+
pb.Validate(data=csv_file, label="Game Revenue Validation")
|
|
3136
|
+
.col_exists(["player_id", "session_id", "item_name"])
|
|
3137
|
+
.col_vals_regex(
|
|
3138
|
+
columns="session_id",
|
|
3139
|
+
pattern=r"[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}"
|
|
3140
|
+
)
|
|
3141
|
+
.col_vals_gt(columns="item_revenue", value=0, na_pass=True)
|
|
3142
|
+
.interrogate()
|
|
3143
|
+
)
|
|
3144
|
+
|
|
3145
|
+
validation_4
|
|
3146
|
+
```
|
|
3147
|
+
|
|
3148
|
+
The CSV loading is automatic, so when a string or Path with a `.csv` extension is provided,
|
|
3149
|
+
Pointblank will automatically load the file using the best available DataFrame library (Polars
|
|
3150
|
+
preferred, Pandas as fallback). The loaded data can then be used with all validation methods
|
|
3151
|
+
just like any other supported table type.
|
|
3152
|
+
|
|
3153
|
+
### Working with Parquet Files
|
|
3154
|
+
|
|
3155
|
+
The `Validate` class can directly accept Parquet files and datasets in various formats. The
|
|
3156
|
+
following examples illustrate how to validate Parquet files:
|
|
3157
|
+
|
|
3158
|
+
```{python}
|
|
3159
|
+
# Single Parquet file from package data
|
|
3160
|
+
parquet_path = pb.get_data_path("nycflights", "parquet")
|
|
3161
|
+
|
|
3162
|
+
validation_5 = (
|
|
3163
|
+
pb.Validate(
|
|
3164
|
+
data=parquet_path,
|
|
3165
|
+
tbl_name="NYC Flights Data"
|
|
3166
|
+
)
|
|
3167
|
+
.col_vals_not_null(["carrier", "origin", "dest"])
|
|
3168
|
+
.col_vals_gt(columns="distance", value=0)
|
|
3169
|
+
.interrogate()
|
|
3170
|
+
)
|
|
3171
|
+
|
|
3172
|
+
validation_5
|
|
3173
|
+
```
|
|
3174
|
+
|
|
3175
|
+
You can also use glob patterns and directories. Here are some examples for how to:
|
|
3176
|
+
|
|
3177
|
+
1. load multiple Parquet files
|
|
3178
|
+
2. load a Parquet-containing directory
|
|
3179
|
+
3. load a partitioned Parquet dataset
|
|
3180
|
+
|
|
3181
|
+
```python
|
|
3182
|
+
# Multiple Parquet files with glob patterns
|
|
3183
|
+
validation_6 = pb.Validate(data="data/sales_*.parquet")
|
|
3184
|
+
|
|
3185
|
+
# Directory containing Parquet files
|
|
3186
|
+
validation_7 = pb.Validate(data="parquet_data/")
|
|
3187
|
+
|
|
3188
|
+
# Partitioned Parquet dataset
|
|
3189
|
+
validation_8 = (
|
|
3190
|
+
pb.Validate(data="sales_data/") # Contains year=2023/quarter=Q1/region=US/sales.parquet
|
|
3191
|
+
.col_exists(["transaction_id", "amount", "year", "quarter", "region"])
|
|
3192
|
+
.interrogate()
|
|
3193
|
+
)
|
|
3194
|
+
```
|
|
3195
|
+
|
|
3196
|
+
When you point to a directory that contains a partitioned Parquet dataset (with subdirectories
|
|
3197
|
+
like `year=2023/quarter=Q1/region=US/`), Pointblank will automatically:
|
|
3198
|
+
|
|
3199
|
+
- discover all Parquet files recursively
|
|
3200
|
+
- extract partition column values from directory paths
|
|
3201
|
+
- add partition columns to the final DataFrame
|
|
3202
|
+
- combine all partitions into a single table for validation
|
|
3203
|
+
|
|
3204
|
+
Both Polars and Pandas handle partitioned datasets natively, so this works seamlessly with
|
|
3205
|
+
either DataFrame library. The loading preference is Polars first, then Pandas as a fallback.
|
|
3206
|
+
|
|
3207
|
+
### Working with Database Connection Strings
|
|
3208
|
+
|
|
3209
|
+
The `Validate` class supports database connection strings for direct validation of database
|
|
3210
|
+
tables. Connection strings must specify a table using the `::table_name` suffix:
|
|
3211
|
+
|
|
3212
|
+
```{python}
|
|
3213
|
+
# Get path to a DuckDB database file from package data
|
|
3214
|
+
duckdb_path = pb.get_data_path("game_revenue", "duckdb")
|
|
3215
|
+
|
|
3216
|
+
validation_9 = (
|
|
3217
|
+
pb.Validate(
|
|
3218
|
+
data=f"duckdb:///{duckdb_path}::game_revenue",
|
|
3219
|
+
label="DuckDB Game Revenue Validation"
|
|
3220
|
+
)
|
|
3221
|
+
.col_exists(["player_id", "session_id", "item_revenue"])
|
|
3222
|
+
.col_vals_gt(columns="item_revenue", value=0)
|
|
3223
|
+
.interrogate()
|
|
3224
|
+
)
|
|
3225
|
+
|
|
3226
|
+
validation_9
|
|
3227
|
+
```
|
|
3228
|
+
|
|
3229
|
+
For comprehensive documentation on supported connection string formats, error handling, and
|
|
3230
|
+
installation requirements, see the [`connect_to_table()`](`pointblank.connect_to_table`)
|
|
3231
|
+
function. This function handles all the connection logic and provides helpful error messages
|
|
3232
|
+
when table specifications are missing or backend dependencies are not installed.
|
|
2296
3233
|
"""
|
|
2297
3234
|
|
|
2298
3235
|
data: FrameT | Any
|
|
@@ -2306,6 +3243,15 @@ class Validate:
|
|
|
2306
3243
|
locale: str | None = None
|
|
2307
3244
|
|
|
2308
3245
|
def __post_init__(self):
|
|
3246
|
+
# Handle connection string input for the data parameter
|
|
3247
|
+
self.data = _process_connection_string(self.data)
|
|
3248
|
+
|
|
3249
|
+
# Handle CSV file input for the data parameter
|
|
3250
|
+
self.data = _process_csv_input(self.data)
|
|
3251
|
+
|
|
3252
|
+
# Handle Parquet file input for the data parameter
|
|
3253
|
+
self.data = _process_parquet_input(self.data)
|
|
3254
|
+
|
|
2309
3255
|
# Check input of the `thresholds=` argument
|
|
2310
3256
|
_check_thresholds(thresholds=self.thresholds)
|
|
2311
3257
|
|
|
@@ -2481,12 +3427,16 @@ class Validate:
|
|
|
2481
3427
|
(i.e., no validation steps will be created for them).
|
|
2482
3428
|
|
|
2483
3429
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
2484
|
-
for more complex segmentation scenarios. The following inputs are
|
|
3430
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
2485
3431
|
|
|
2486
|
-
|
|
2487
|
-
|
|
2488
|
-
|
|
2489
|
-
|
|
3432
|
+
```
|
|
3433
|
+
# Segments from all unique values in the `region` column
|
|
3434
|
+
# and specific dates in the `date` column
|
|
3435
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
3436
|
+
|
|
3437
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
3438
|
+
segments=["region", "date"]
|
|
3439
|
+
```
|
|
2490
3440
|
|
|
2491
3441
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
2492
3442
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -2769,12 +3719,16 @@ class Validate:
|
|
|
2769
3719
|
(i.e., no validation steps will be created for them).
|
|
2770
3720
|
|
|
2771
3721
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
2772
|
-
for more complex segmentation scenarios. The following inputs are
|
|
3722
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
2773
3723
|
|
|
2774
|
-
|
|
2775
|
-
|
|
2776
|
-
|
|
2777
|
-
|
|
3724
|
+
```
|
|
3725
|
+
# Segments from all unique values in the `region` column
|
|
3726
|
+
# and specific dates in the `date` column
|
|
3727
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
3728
|
+
|
|
3729
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
3730
|
+
segments=["region", "date"]
|
|
3731
|
+
```
|
|
2778
3732
|
|
|
2779
3733
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
2780
3734
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -3056,12 +4010,16 @@ class Validate:
|
|
|
3056
4010
|
(i.e., no validation steps will be created for them).
|
|
3057
4011
|
|
|
3058
4012
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3059
|
-
for more complex segmentation scenarios. The following inputs are
|
|
4013
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
3060
4014
|
|
|
3061
|
-
|
|
3062
|
-
|
|
3063
|
-
|
|
3064
|
-
|
|
4015
|
+
```
|
|
4016
|
+
# Segments from all unique values in the `region` column
|
|
4017
|
+
# and specific dates in the `date` column
|
|
4018
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
4019
|
+
|
|
4020
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
4021
|
+
segments=["region", "date"]
|
|
4022
|
+
```
|
|
3065
4023
|
|
|
3066
4024
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3067
4025
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -3342,12 +4300,16 @@ class Validate:
|
|
|
3342
4300
|
(i.e., no validation steps will be created for them).
|
|
3343
4301
|
|
|
3344
4302
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3345
|
-
for more complex segmentation scenarios. The following inputs are
|
|
4303
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
3346
4304
|
|
|
3347
|
-
|
|
3348
|
-
|
|
3349
|
-
|
|
3350
|
-
|
|
4305
|
+
```
|
|
4306
|
+
# Segments from all unique values in the `region` column
|
|
4307
|
+
# and specific dates in the `date` column
|
|
4308
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
4309
|
+
|
|
4310
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
4311
|
+
segments=["region", "date"]
|
|
4312
|
+
```
|
|
3351
4313
|
|
|
3352
4314
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3353
4315
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -3626,12 +4588,16 @@ class Validate:
|
|
|
3626
4588
|
(i.e., no validation steps will be created for them).
|
|
3627
4589
|
|
|
3628
4590
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3629
|
-
for more complex segmentation scenarios. The following inputs are
|
|
4591
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
3630
4592
|
|
|
3631
|
-
|
|
3632
|
-
|
|
3633
|
-
|
|
3634
|
-
|
|
4593
|
+
```
|
|
4594
|
+
# Segments from all unique values in the `region` column
|
|
4595
|
+
# and specific dates in the `date` column
|
|
4596
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
4597
|
+
|
|
4598
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
4599
|
+
segments=["region", "date"]
|
|
4600
|
+
```
|
|
3635
4601
|
|
|
3636
4602
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3637
4603
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -3914,12 +4880,16 @@ class Validate:
|
|
|
3914
4880
|
(i.e., no validation steps will be created for them).
|
|
3915
4881
|
|
|
3916
4882
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3917
|
-
for more complex segmentation scenarios. The following inputs are
|
|
4883
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
3918
4884
|
|
|
3919
|
-
|
|
3920
|
-
|
|
3921
|
-
|
|
3922
|
-
|
|
4885
|
+
```
|
|
4886
|
+
# Segments from all unique values in the `region` column
|
|
4887
|
+
# and specific dates in the `date` column
|
|
4888
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
4889
|
+
|
|
4890
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
4891
|
+
segments=["region", "date"]
|
|
4892
|
+
```
|
|
3923
4893
|
|
|
3924
4894
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3925
4895
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -4216,12 +5186,16 @@ class Validate:
|
|
|
4216
5186
|
(i.e., no validation steps will be created for them).
|
|
4217
5187
|
|
|
4218
5188
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
4219
|
-
for more complex segmentation scenarios. The following inputs are
|
|
5189
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
4220
5190
|
|
|
4221
|
-
|
|
4222
|
-
|
|
4223
|
-
|
|
4224
|
-
|
|
5191
|
+
```
|
|
5192
|
+
# Segments from all unique values in the `region` column
|
|
5193
|
+
# and specific dates in the `date` column
|
|
5194
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
5195
|
+
|
|
5196
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
5197
|
+
segments=["region", "date"]
|
|
5198
|
+
```
|
|
4225
5199
|
|
|
4226
5200
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
4227
5201
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -4532,12 +5506,16 @@ class Validate:
|
|
|
4532
5506
|
(i.e., no validation steps will be created for them).
|
|
4533
5507
|
|
|
4534
5508
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
4535
|
-
for more complex segmentation scenarios. The following inputs are
|
|
5509
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
4536
5510
|
|
|
4537
|
-
|
|
4538
|
-
|
|
4539
|
-
|
|
4540
|
-
|
|
5511
|
+
```
|
|
5512
|
+
# Segments from all unique values in the `region` column
|
|
5513
|
+
# and specific dates in the `date` column
|
|
5514
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
5515
|
+
|
|
5516
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
5517
|
+
segments=["region", "date"]
|
|
5518
|
+
```
|
|
4541
5519
|
|
|
4542
5520
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
4543
5521
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -4804,12 +5782,16 @@ class Validate:
|
|
|
4804
5782
|
(i.e., no validation steps will be created for them).
|
|
4805
5783
|
|
|
4806
5784
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
4807
|
-
for more complex segmentation scenarios. The following inputs are
|
|
5785
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
4808
5786
|
|
|
4809
|
-
|
|
4810
|
-
|
|
4811
|
-
|
|
4812
|
-
|
|
5787
|
+
```
|
|
5788
|
+
# Segments from all unique values in the `region` column
|
|
5789
|
+
# and specific dates in the `date` column
|
|
5790
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
5791
|
+
|
|
5792
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
5793
|
+
segments=["region", "date"]
|
|
5794
|
+
```
|
|
4813
5795
|
|
|
4814
5796
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
4815
5797
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -5057,12 +6039,16 @@ class Validate:
|
|
|
5057
6039
|
(i.e., no validation steps will be created for them).
|
|
5058
6040
|
|
|
5059
6041
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
5060
|
-
for more complex segmentation scenarios. The following inputs are
|
|
6042
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
5061
6043
|
|
|
5062
|
-
|
|
5063
|
-
|
|
5064
|
-
|
|
5065
|
-
|
|
6044
|
+
```
|
|
6045
|
+
# Segments from all unique values in the `region` column
|
|
6046
|
+
# and specific dates in the `date` column
|
|
6047
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
6048
|
+
|
|
6049
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
6050
|
+
segments=["region", "date"]
|
|
6051
|
+
```
|
|
5066
6052
|
|
|
5067
6053
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
5068
6054
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -5218,9 +6204,9 @@ class Validate:
|
|
|
5218
6204
|
active: bool = True,
|
|
5219
6205
|
) -> Validate:
|
|
5220
6206
|
"""
|
|
5221
|
-
Validate whether values in a column are
|
|
6207
|
+
Validate whether values in a column are Null.
|
|
5222
6208
|
|
|
5223
|
-
The `col_vals_null()` validation method checks whether column values in a table are
|
|
6209
|
+
The `col_vals_null()` validation method checks whether column values in a table are Null.
|
|
5224
6210
|
This validation will operate over the number of test units that is equal to the number
|
|
5225
6211
|
of rows in the table.
|
|
5226
6212
|
|
|
@@ -5301,12 +6287,16 @@ class Validate:
|
|
|
5301
6287
|
(i.e., no validation steps will be created for them).
|
|
5302
6288
|
|
|
5303
6289
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
5304
|
-
for more complex segmentation scenarios. The following inputs are
|
|
6290
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
5305
6291
|
|
|
5306
|
-
|
|
5307
|
-
|
|
5308
|
-
|
|
5309
|
-
|
|
6292
|
+
```
|
|
6293
|
+
# Segments from all unique values in the `region` column
|
|
6294
|
+
# and specific dates in the `date` column
|
|
6295
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
6296
|
+
|
|
6297
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
6298
|
+
segments=["region", "date"]
|
|
6299
|
+
```
|
|
5310
6300
|
|
|
5311
6301
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
5312
6302
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -5457,10 +6447,10 @@ class Validate:
|
|
|
5457
6447
|
active: bool = True,
|
|
5458
6448
|
) -> Validate:
|
|
5459
6449
|
"""
|
|
5460
|
-
Validate whether values in a column are not
|
|
6450
|
+
Validate whether values in a column are not Null.
|
|
5461
6451
|
|
|
5462
6452
|
The `col_vals_not_null()` validation method checks whether column values in a table are not
|
|
5463
|
-
|
|
6453
|
+
Null. This validation will operate over the number of test units that is equal to the number
|
|
5464
6454
|
of rows in the table.
|
|
5465
6455
|
|
|
5466
6456
|
Parameters
|
|
@@ -5540,12 +6530,16 @@ class Validate:
|
|
|
5540
6530
|
(i.e., no validation steps will be created for them).
|
|
5541
6531
|
|
|
5542
6532
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
5543
|
-
for more complex segmentation scenarios. The following inputs are
|
|
6533
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
5544
6534
|
|
|
5545
|
-
|
|
5546
|
-
|
|
5547
|
-
|
|
5548
|
-
|
|
6535
|
+
```
|
|
6536
|
+
# Segments from all unique values in the `region` column
|
|
6537
|
+
# and specific dates in the `date` column
|
|
6538
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
6539
|
+
|
|
6540
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
6541
|
+
segments=["region", "date"]
|
|
6542
|
+
```
|
|
5549
6543
|
|
|
5550
6544
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
5551
6545
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -5787,12 +6781,16 @@ class Validate:
|
|
|
5787
6781
|
(i.e., no validation steps will be created for them).
|
|
5788
6782
|
|
|
5789
6783
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
5790
|
-
for more complex segmentation scenarios. The following inputs are
|
|
6784
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
5791
6785
|
|
|
5792
|
-
|
|
5793
|
-
|
|
5794
|
-
|
|
5795
|
-
|
|
6786
|
+
```
|
|
6787
|
+
# Segments from all unique values in the `region` column
|
|
6788
|
+
# and specific dates in the `date` column
|
|
6789
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
6790
|
+
|
|
6791
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
6792
|
+
segments=["region", "date"]
|
|
6793
|
+
```
|
|
5796
6794
|
|
|
5797
6795
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
5798
6796
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -6030,12 +7028,16 @@ class Validate:
|
|
|
6030
7028
|
(i.e., no validation steps will be created for them).
|
|
6031
7029
|
|
|
6032
7030
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
6033
|
-
for more complex segmentation scenarios. The following inputs are
|
|
7031
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
6034
7032
|
|
|
6035
|
-
|
|
6036
|
-
|
|
6037
|
-
|
|
6038
|
-
|
|
7033
|
+
```
|
|
7034
|
+
# Segments from all unique values in the `region` column
|
|
7035
|
+
# and specific dates in the `date` column
|
|
7036
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
7037
|
+
|
|
7038
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
7039
|
+
segments=["region", "date"]
|
|
7040
|
+
```
|
|
6039
7041
|
|
|
6040
7042
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
6041
7043
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -6421,12 +7423,16 @@ class Validate:
|
|
|
6421
7423
|
(i.e., no validation steps will be created for them).
|
|
6422
7424
|
|
|
6423
7425
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
6424
|
-
for more complex segmentation scenarios. The following inputs are
|
|
7426
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
6425
7427
|
|
|
6426
|
-
|
|
6427
|
-
|
|
6428
|
-
|
|
6429
|
-
|
|
7428
|
+
```
|
|
7429
|
+
# Segments from all unique values in the `region` column
|
|
7430
|
+
# and specific dates in the `date` column
|
|
7431
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
7432
|
+
|
|
7433
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
7434
|
+
segments=["region", "date"]
|
|
7435
|
+
```
|
|
6430
7436
|
|
|
6431
7437
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
6432
7438
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -6658,12 +7664,16 @@ class Validate:
|
|
|
6658
7664
|
(i.e., no validation steps will be created for them).
|
|
6659
7665
|
|
|
6660
7666
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
6661
|
-
for more complex segmentation scenarios. The following inputs are
|
|
7667
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
6662
7668
|
|
|
6663
|
-
|
|
6664
|
-
|
|
6665
|
-
|
|
6666
|
-
|
|
7669
|
+
```
|
|
7670
|
+
# Segments from all unique values in the `region` column
|
|
7671
|
+
# and specific dates in the `date` column
|
|
7672
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
7673
|
+
|
|
7674
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
7675
|
+
segments=["region", "date"]
|
|
7676
|
+
```
|
|
6667
7677
|
|
|
6668
7678
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
6669
7679
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -8216,37 +9226,47 @@ class Validate:
|
|
|
8216
9226
|
|
|
8217
9227
|
# Determine whether any preprocessing functions are to be applied to the table
|
|
8218
9228
|
if validation.pre is not None:
|
|
8219
|
-
|
|
8220
|
-
|
|
9229
|
+
try:
|
|
9230
|
+
# Read the text of the preprocessing function
|
|
9231
|
+
pre_text = _pre_processing_funcs_to_str(validation.pre)
|
|
8221
9232
|
|
|
8222
|
-
|
|
8223
|
-
|
|
9233
|
+
# Determine if the preprocessing function is a lambda function; return a boolean
|
|
9234
|
+
is_lambda = re.match(r"^lambda", pre_text) is not None
|
|
8224
9235
|
|
|
8225
|
-
|
|
8226
|
-
|
|
8227
|
-
|
|
8228
|
-
|
|
8229
|
-
|
|
8230
|
-
|
|
9236
|
+
# If the preprocessing function is a lambda function, then check if there is
|
|
9237
|
+
# a keyword argument called `dfn` in the lambda signature; if so, that's a cue
|
|
9238
|
+
# to use a Narwhalified version of the table
|
|
9239
|
+
if is_lambda:
|
|
9240
|
+
# Get the signature of the lambda function
|
|
9241
|
+
sig = inspect.signature(validation.pre)
|
|
8231
9242
|
|
|
8232
|
-
|
|
8233
|
-
|
|
8234
|
-
|
|
8235
|
-
|
|
9243
|
+
# Check if the lambda function has a keyword argument called `dfn`
|
|
9244
|
+
if "dfn" in sig.parameters:
|
|
9245
|
+
# Convert the table to a Narwhals DataFrame
|
|
9246
|
+
data_tbl_step = nw.from_native(data_tbl_step)
|
|
8236
9247
|
|
|
8237
|
-
|
|
8238
|
-
|
|
9248
|
+
# Apply the preprocessing function to the table
|
|
9249
|
+
data_tbl_step = validation.pre(dfn=data_tbl_step)
|
|
8239
9250
|
|
|
8240
|
-
|
|
8241
|
-
|
|
9251
|
+
# Convert the table back to its original format
|
|
9252
|
+
data_tbl_step = nw.to_native(data_tbl_step)
|
|
8242
9253
|
|
|
8243
|
-
|
|
8244
|
-
|
|
9254
|
+
else:
|
|
9255
|
+
# Apply the preprocessing function to the table
|
|
9256
|
+
data_tbl_step = validation.pre(data_tbl_step)
|
|
9257
|
+
|
|
9258
|
+
# If the preprocessing function is a regular (non-lambda) callable, apply it to the table
|
|
9259
|
+
elif isinstance(validation.pre, Callable):
|
|
8245
9260
|
data_tbl_step = validation.pre(data_tbl_step)
|
|
8246
9261
|
|
|
8247
|
-
|
|
8248
|
-
|
|
8249
|
-
|
|
9262
|
+
except Exception:
|
|
9263
|
+
# If preprocessing fails, mark the validation as having an eval_error
|
|
9264
|
+
validation.eval_error = True
|
|
9265
|
+
end_time = datetime.datetime.now(datetime.timezone.utc)
|
|
9266
|
+
validation.proc_duration_s = (end_time - start_time).total_seconds()
|
|
9267
|
+
validation.time_processed = end_time.isoformat(timespec="milliseconds")
|
|
9268
|
+
validation.active = False
|
|
9269
|
+
continue
|
|
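Given the dispatch logic above, a lambda passed to `pre=` whose parameter is named `dfn` receives a Narwhals DataFrame, and the result is converted back to the native format afterwards. A hedged usage sketch with illustrative data:

```python
import narwhals as nw
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"x": [1, 2, 3, 50]})

validation = (
    pb.Validate(data=tbl)
    .col_vals_gt(
        columns="x",
        value=0,
        # The `dfn` parameter name signals that the table should be handed
        # over as a Narwhals DataFrame (and converted back afterwards)
        pre=lambda dfn: dfn.filter(nw.col("x") < 10),
    )
    .interrogate()
)
```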
8250
9270
|
|
|
8251
9271
|
# ------------------------------------------------
|
|
8252
9272
|
# Segmentation stage
|
|
@@ -8259,12 +9279,28 @@ class Validate:
|
|
|
8259
9279
|
data_tbl=data_tbl_step, segments_expr=validation.segments
|
|
8260
9280
|
)
|
|
8261
9281
|
|
|
9282
|
+
# ------------------------------------------------
|
|
9283
|
+
# Determine table type and `collect()` if needed
|
|
9284
|
+
# ------------------------------------------------
|
|
9285
|
+
|
|
9286
|
+
if tbl_type not in IBIS_BACKENDS:
|
|
9287
|
+
tbl_type = "local"
|
|
9288
|
+
|
|
9289
|
+
# If the table is a lazy frame, we need to collect it
|
|
9290
|
+
if _is_lazy_frame(data_tbl_step):
|
|
9291
|
+
data_tbl_step = data_tbl_step.collect()
|
|
9292
|
+
|
|
9293
|
+
# ------------------------------------------------
|
|
9294
|
+
# Set the number of test units
|
|
9295
|
+
# ------------------------------------------------
|
|
9296
|
+
|
|
8262
9297
|
validation.n = NumberOfTestUnits(df=data_tbl_step, column=column).get_test_units(
|
|
8263
9298
|
tbl_type=tbl_type
|
|
8264
9299
|
)
|
|
8265
9300
|
|
|
8266
|
-
|
|
8267
|
-
|
|
9301
|
+
# ------------------------------------------------
|
|
9302
|
+
# Validation stage
|
|
9303
|
+
# ------------------------------------------------
|
|
8268
9304
|
|
|
8269
9305
|
if assertion_category == "COMPARE_ONE":
|
|
8270
9306
|
results_tbl = ColValsCompareOne(
|
|
@@ -8455,36 +9491,32 @@ class Validate:
|
|
|
8455
9491
|
|
|
8456
9492
|
else:
|
|
8457
9493
|
# If the result is not a list, then we assume it's a table in the conventional
|
|
8458
|
-
# form (where the column is `pb_is_good_` exists, with boolean values
|
|
8459
|
-
|
|
9494
|
+
# form (where the column `pb_is_good_` exists, with boolean values)
|
|
8460
9495
|
results_tbl = results_tbl_list
|
|
8461
9496
|
|
|
8462
9497
|
# If the results table is not `None`, then we assume there is a table with a column
|
|
8463
9498
|
# called `pb_is_good_` that contains boolean values; we can then use this table to
|
|
8464
9499
|
# determine the number of test units that passed and failed
|
|
8465
9500
|
if results_tbl is not None:
|
|
8466
|
-
#
|
|
8467
|
-
|
|
8468
|
-
|
|
8469
|
-
|
|
8470
|
-
|
|
8471
|
-
|
|
8472
|
-
|
|
8473
|
-
results_list = (
|
|
8474
|
-
results_tbl.select("pb_is_good_").to_pandas()["pb_is_good_"].to_list()
|
|
8475
|
-
)
|
|
8476
|
-
else:
|
|
8477
|
-
results_list = (
|
|
8478
|
-
results_tbl.select("pb_is_good_").to_polars()["pb_is_good_"].to_list()
|
|
8479
|
-
)
|
|
9501
|
+
# Count the number of passing and failing test units
|
|
9502
|
+
validation.n_passed = _count_true_values_in_column(
|
|
9503
|
+
tbl=results_tbl, column="pb_is_good_"
|
|
9504
|
+
)
|
|
9505
|
+
validation.n_failed = _count_true_values_in_column(
|
|
9506
|
+
tbl=results_tbl, column="pb_is_good_", inverse=True
|
|
9507
|
+
)
|
|
8480
9508
|
|
|
8481
|
-
|
|
8482
|
-
|
|
9509
|
+
# Solely for the col_vals_in_set assertion type, any Null values in the
|
|
9510
|
+
# `pb_is_good_` column are counted as failing test units
|
|
9511
|
+
if assertion_type == "col_vals_in_set":
|
|
9512
|
+
null_count = _count_null_values_in_column(tbl=results_tbl, column="pb_is_good_")
|
|
9513
|
+
validation.n_failed += null_count
|
|
9514
|
+
|
|
9515
|
+
# For column-value validations, the number of test units is the number of rows
|
|
9516
|
+
validation.n = get_row_count(data=results_tbl)
|
|
8483
9517
|
|
|
8484
|
-
|
|
8485
|
-
validation.
|
|
8486
|
-
validation.n_passed = results_list.count(True)
|
|
8487
|
-
validation.n_failed = results_list.count(False)
|
|
9518
|
+
# Set the `all_passed` attribute based on whether there are any failing test units
|
|
9519
|
+
validation.all_passed = validation.n_failed == 0
|
|
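The pass/fail counting now delegates to `_count_true_values_in_column()` and `_count_null_values_in_column()` from `pointblank._utils`, whose bodies are not part of this diff. Below is a hypothetical Narwhals-based sketch of the counting idea only, not the library's actual implementation.

```python
import narwhals as nw


def count_true_values_sketch(tbl, column: str, inverse: bool = False) -> int:
    """Count True (or, with inverse=True, False) entries in a boolean column."""
    df = nw.from_native(tbl, eager_only=True)
    target = False if inverse else True
    # Null entries match neither True nor False, so they are excluded here;
    # the `col_vals_in_set` branch above adds nulls to the failure count.
    return int((df[column] == target).fill_null(False).sum())
```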
8488
9520
|
|
|
8489
9521
|
# Calculate fractions of passing and failing test units
|
|
8490
9522
|
# - `f_passed` is the fraction of test units that passed
|
|
@@ -8831,7 +9863,7 @@ class Validate:
|
|
|
8831
9863
|
raise AssertionError(msg)
|
|
8832
9864
|
|
|
8833
9865
|
def assert_below_threshold(
|
|
8834
|
-
self, level: str = "warning", i: int = None, message: str = None
|
|
9866
|
+
self, level: str = "warning", i: int | None = None, message: str | None = None
|
|
8835
9867
|
) -> None:
|
|
8836
9868
|
"""
|
|
8837
9869
|
Raise an `AssertionError` if validation steps exceed a specified threshold level.
|
|
@@ -8940,12 +9972,12 @@ class Validate:
|
|
|
8940
9972
|
|
|
8941
9973
|
See Also
|
|
8942
9974
|
--------
|
|
8943
|
-
- [`warning()`](`pointblank.Validate.warning`):
|
|
9975
|
+
- [`warning()`](`pointblank.Validate.warning`): get the 'warning' status for each validation
|
|
8944
9976
|
step
|
|
8945
|
-
- [`error()`](`pointblank.Validate.error`):
|
|
8946
|
-
- [`critical()`](`pointblank.Validate.critical`):
|
|
9977
|
+
- [`error()`](`pointblank.Validate.error`): get the 'error' status for each validation step
|
|
9978
|
+
- [`critical()`](`pointblank.Validate.critical`): get the 'critical' status for each
|
|
8947
9979
|
validation step
|
|
8948
|
-
- [`assert_passing()`](`pointblank.Validate.assert_passing`):
|
|
9980
|
+
- [`assert_passing()`](`pointblank.Validate.assert_passing`): assert all validations pass
|
|
8949
9981
|
completely
|
|
8950
9982
|
"""
|
|
8951
9983
|
# Check if validation has been interrogated
|
|
@@ -8991,6 +10023,145 @@ class Validate:
|
|
|
8991
10023
|
)
|
|
8992
10024
|
raise AssertionError(msg)
|
|
8993
10025
|
|
|
10026
|
+
def above_threshold(self, level: str = "warning", i: int | None = None) -> bool:
|
|
10027
|
+
"""
|
|
10028
|
+
Check if any validation steps exceed a specified threshold level.
|
|
10029
|
+
|
|
10030
|
+
The `above_threshold()` method checks whether validation steps exceed a given threshold
|
|
10031
|
+
level. This provides a non-exception-based alternative to
|
|
10032
|
+
[`assert_below_threshold()`](`pointblank.Validate.assert_below_threshold`) for conditional
|
|
10033
|
+
workflow control based on validation results.
|
|
10034
|
+
|
|
10035
|
+
This method is useful in scenarios where you want to check if any validation steps failed
|
|
10036
|
+
beyond a certain threshold without raising an exception, allowing for more flexible
|
|
10037
|
+
programmatic responses to validation issues.
|
|
10038
|
+
|
|
10039
|
+
Parameters
|
|
10040
|
+
----------
|
|
10041
|
+
level
|
|
10042
|
+
The threshold level to check against. Valid options are: `"warning"` (the least severe
|
|
10043
|
+
threshold level), `"error"` (the middle severity threshold level), and `"critical"` (the
|
|
10044
|
+
most severe threshold level). The default is `"warning"`.
|
|
10045
|
+
i
|
|
10046
|
+
Specific validation step number(s) to check. If a single integer, checks only that step.
|
|
10047
|
+
If a list of integers, checks all specified steps. If `None` (the default), checks all
|
|
10048
|
+
validation steps. Step numbers are 1-based (first step is `1`, not `0`).
|
|
10049
|
+
|
|
10050
|
+
Returns
|
|
10051
|
+
-------
|
|
10052
|
+
bool
|
|
10053
|
+
`True` if any of the specified validation steps exceed the given threshold level,
|
|
10054
|
+
`False` otherwise.
|
|
10055
|
+
|
|
10056
|
+
Raises
|
|
10057
|
+
------
|
|
10058
|
+
ValueError
|
|
10059
|
+
If an invalid threshold level is provided.
|
|
10060
|
+
|
|
10061
|
+
Examples
|
|
10062
|
+
--------
|
|
10063
|
+
```{python}
|
|
10064
|
+
#| echo: false
|
|
10065
|
+
#| output: false
|
|
10066
|
+
import pointblank as pb
|
|
10067
|
+
pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
|
|
10068
|
+
```
|
|
10069
|
+
Below are some examples of how to use the `above_threshold()` method. First, we'll create a
|
|
10070
|
+
simple Polars DataFrame with a single column (`values`).
|
|
10071
|
+
|
|
10072
|
+
```{python}
|
|
10073
|
+
import polars as pl
|
|
10074
|
+
|
|
10075
|
+
tbl = pl.DataFrame({
|
|
10076
|
+
"values": [1, 2, 3, 4, 5, 0, -1]
|
|
10077
|
+
})
|
|
10078
|
+
```
|
|
10079
|
+
|
|
10080
|
+
Then a validation plan will be created with thresholds (`warning=0.1`, `error=0.2`,
|
|
10081
|
+
`critical=0.3`). After interrogating, we display the validation report table:
|
|
10082
|
+
|
|
10083
|
+
```{python}
|
|
10084
|
+
import pointblank as pb
|
|
10085
|
+
|
|
10086
|
+
validation = (
|
|
10087
|
+
pb.Validate(data=tbl, thresholds=(0.1, 0.2, 0.3))
|
|
10088
|
+
.col_vals_gt(columns="values", value=0)
|
|
10089
|
+
.col_vals_lt(columns="values", value=10)
|
|
10090
|
+
.col_vals_between(columns="values", left=0, right=5)
|
|
10091
|
+
.interrogate()
|
|
10092
|
+
)
|
|
10093
|
+
|
|
10094
|
+
validation
|
|
10095
|
+
```
|
|
10096
|
+
|
|
10097
|
+
Let's check if any steps exceed the 'warning' threshold with the `above_threshold()` method.
|
|
10098
|
+
A message will be printed if that's the case:
|
|
10099
|
+
|
|
10100
|
+
```{python}
|
|
10101
|
+
if validation.above_threshold(level="warning"):
|
|
10102
|
+
print("Some steps have exceeded the warning threshold")
|
|
10103
|
+
```
|
|
10104
|
+
|
|
10105
|
+
Check if only steps 2 and 3 exceed the 'error' threshold through use of the `i=` argument:
|
|
10106
|
+
|
|
10107
|
+
```{python}
|
|
10108
|
+
if validation.above_threshold(level="error", i=[2, 3]):
|
|
10109
|
+
print("Steps 2 and/or 3 have exceeded the error threshold")
|
|
10110
|
+
```
|
|
10111
|
+
|
|
10112
|
+
You can use this in a workflow to conditionally trigger processes. Here's a snippet of how
|
|
10113
|
+
you might use this in a function:
|
|
10114
|
+
|
|
10115
|
+
```python
|
|
10116
|
+
def process_data(validation_obj):
|
|
10117
|
+
# Only continue processing if validation passes critical thresholds
|
|
10118
|
+
if not validation_obj.above_threshold(level="critical"):
|
|
10119
|
+
# Continue with processing
|
|
10120
|
+
print("Data meets critical quality thresholds, proceeding...")
|
|
10121
|
+
return True
|
|
10122
|
+
else:
|
|
10123
|
+
# Log failure and stop processing
|
|
10124
|
+
print("Data fails critical quality checks, aborting...")
|
|
10125
|
+
return False
|
|
10126
|
+
```
|
|
10127
|
+
|
|
10128
|
+
Note that this is just a suggestion for how to implement conditional workflow processes. You
|
|
10129
|
+
should adapt this pattern to your specific requirements, which might include different
|
|
10130
|
+
threshold levels, custom logging mechanisms, or integration with your organization's data
|
|
10131
|
+
pipelines and notification systems.
|
|
10132
|
+
|
|
10133
|
+
See Also
|
|
10134
|
+
--------
|
|
10135
|
+
- [`assert_below_threshold()`](`pointblank.Validate.assert_below_threshold`): a similar
|
|
10136
|
+
method that raises an exception if thresholds are exceeded
|
|
10137
|
+
- [`warning()`](`pointblank.Validate.warning`): get the 'warning' status for each validation
|
|
10138
|
+
step
|
|
10139
|
+
- [`error()`](`pointblank.Validate.error`): get the 'error' status for each validation step
|
|
10140
|
+
- [`critical()`](`pointblank.Validate.critical`): get the 'critical' status for each
|
|
10141
|
+
validation step
|
|
10142
|
+
"""
|
|
10143
|
+
# Ensure validation has been run
|
|
10144
|
+
if not hasattr(self, "time_start") or self.time_start is None:
|
|
10145
|
+
return False
|
|
10146
|
+
|
|
10147
|
+
# Validate the level parameter
|
|
10148
|
+
level = level.lower()
|
|
10149
|
+
if level not in ["warning", "error", "critical"]:
|
|
10150
|
+
raise ValueError(
|
|
10151
|
+
f"Invalid threshold level: {level}. Must be one of 'warning', 'error', or 'critical'."
|
|
10152
|
+
)
|
|
10153
|
+
|
|
10154
|
+
# Get the threshold status using the appropriate method
|
|
10155
|
+
if level == "warning":
|
|
10156
|
+
status = self.warning(i=i)
|
|
10157
|
+
elif level == "error":
|
|
10158
|
+
status = self.error(i=i)
|
|
10159
|
+
elif level == "critical":
|
|
10160
|
+
status = self.critical(i=i)
|
|
10161
|
+
|
|
10162
|
+
# Return True if any steps exceeded the threshold
|
|
10163
|
+
return any(status.values())
|
|
10164
|
+
|
|
8994
10165
|
def n(self, i: int | list[int] | None = None, scalar: bool = False) -> dict[int, int] | int:
|
|
8995
10166
|
"""
|
|
8996
10167
|
Provides a dictionary of the number of test units for each validation step.
|
|
@@ -9654,7 +10825,7 @@ class Validate:
|
|
|
9654
10825
|
Get the 'critical' level status for each validation step.
|
|
9655
10826
|
|
|
9656
10827
|
The 'critical' status for a validation step is `True` if the fraction of failing test units
|
|
9657
|
-
meets or exceeds the threshold for the
|
|
10828
|
+
meets or exceeds the threshold for the 'critical' level. Otherwise, the status is `False`.
|
|
9658
10829
|
|
|
9659
10830
|
The ascribed name of 'critical' is semantic and is thus simply a status indicator that could
|
|
9660
10831
|
be used to trigger some action to be taken. Here's how it fits in with other status
|
|
@@ -9666,14 +10837,14 @@ class Validate:
|
|
|
9666
10837
|
severity
|
|
9667
10838
|
- 'critical': the status obtained by calling `critical()`, most severe
|
|
9668
10839
|
|
|
9669
|
-
This method provides a dictionary of the
|
|
9670
|
-
|
|
9671
|
-
|
|
10840
|
+
This method provides a dictionary of the 'critical' status for each validation step. If the
|
|
10841
|
+
`scalar=True` argument is provided and `i=` is a scalar, the value is returned as a scalar
|
|
10842
|
+
instead of a dictionary.
|
|
9672
10843
|
|
|
9673
10844
|
Parameters
|
|
9674
10845
|
----------
|
|
9675
10846
|
i
|
|
9676
|
-
The validation step number(s) from which the
|
|
10847
|
+
The validation step number(s) from which the 'critical' status is obtained. Can be
|
|
9677
10848
|
provided as a list of integers or a single integer. If `None`, all steps are included.
|
|
9678
10849
|
scalar
|
|
9679
10850
|
If `True` and `i=` is a scalar, return the value as a scalar instead of a dictionary.
|
|
@@ -9681,7 +10852,7 @@ class Validate:
|
|
|
9681
10852
|
Returns
|
|
9682
10853
|
-------
|
|
9683
10854
|
dict[int, bool] | bool
|
|
9684
|
-
A dictionary of the
|
|
10855
|
+
A dictionary of the 'critical' status for each validation step or a scalar value.
|
|
9685
10856
|
|
|
9686
10857
|
Examples
|
|
9687
10858
|
--------
|
|
@@ -9760,11 +10931,13 @@ class Validate:
|
|
|
9760
10931
|
Get the rows that failed for each validation step.
|
|
9761
10932
|
|
|
9762
10933
|
After the [`interrogate()`](`pointblank.Validate.interrogate`) method has been called, the
|
|
9763
|
-
`get_data_extracts()` method can be used to extract the rows that failed in each
|
|
9764
|
-
validation step (e.g.,
|
|
9765
|
-
|
|
9766
|
-
|
|
9767
|
-
|
|
10934
|
+
`get_data_extracts()` method can be used to extract the rows that failed in each
|
|
10935
|
+
column-value or row-based validation step (e.g.,
|
|
10936
|
+
[`col_vals_gt()`](`pointblank.Validate.col_vals_gt`),
|
|
10937
|
+
[`rows_distinct()`](`pointblank.Validate.rows_distinct`), etc.). The method returns a
|
|
10938
|
+
dictionary of tables containing the rows that failed in every validation step. If
|
|
10939
|
+
`frame=True` and `i=` is a scalar, the value is conveniently returned as a table (forgoing
|
|
10940
|
+
the dictionary structure).
|
|
9768
10941
|
|
|
9769
10942
|
Parameters
|
|
9770
10943
|
----------
|
|
@@ -9777,13 +10950,13 @@ class Validate:
|
|
|
9777
10950
|
Returns
|
|
9778
10951
|
-------
|
|
9779
10952
|
dict[int, FrameT | None] | FrameT | None
|
|
9780
|
-
A dictionary of tables containing the rows that failed in every
|
|
9781
|
-
step
|
|
10953
|
+
A dictionary of tables containing the rows that failed in every compatible validation
|
|
10954
|
+
step. Alternatively, it can be a DataFrame if `frame=True` and `i=` is a scalar.
|
|
9782
10955
|
|
|
9783
|
-
Validation Methods
|
|
9784
|
-
|
|
9785
|
-
The following validation methods
|
|
9786
|
-
failing test units.
|
|
10956
|
+
Compatible Validation Methods for Yielding Extracted Rows
|
|
10957
|
+
---------------------------------------------------------
|
|
10958
|
+
The following validation methods operate on column values and will have rows extracted when
|
|
10959
|
+
there are failing test units.
|
|
9787
10960
|
|
|
9788
10961
|
- [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
|
|
9789
10962
|
- [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
|
|
@@ -9798,11 +10971,20 @@ class Validate:
|
|
|
9798
10971
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
9799
10972
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
9800
10973
|
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
10974
|
+
- [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
|
|
10975
|
+
- [`conjointly()`](`pointblank.Validate.conjointly`)
|
|
10976
|
+
|
|
10977
|
+
An extracted row for these validation methods means that a test unit failed for that row in
|
|
10978
|
+
the validation step.
|
|
10979
|
+
|
|
10980
|
+
These row-based validation methods will also have rows extracted should there be failing
|
|
10981
|
+
rows:
|
|
10982
|
+
|
|
9801
10983
|
- [`rows_distinct()`](`pointblank.Validate.rows_distinct`)
|
|
10984
|
+
- [`rows_complete()`](`pointblank.Validate.rows_complete`)
|
|
9802
10985
|
|
|
9803
|
-
|
|
9804
|
-
|
|
9805
|
-
understanding the nature of the failing test units.
|
|
10986
|
+
The extracted rows are a subset of the original table and are useful for further analysis
|
|
10987
|
+
or for understanding the nature of the failing test units.
|
|
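A brief usage sketch of pulling the failing rows out of an interrogated plan, using the `i=` and `frame=` arguments described above:

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"a": [5, 7, 1, 9]})

validation = (
    pb.Validate(data=tbl)
    .col_vals_gt(columns="a", value=4)  # step 1: one failing row (a == 1)
    .interrogate()
)

extracts = validation.get_data_extracts()                       # dict keyed by step number
failed_step_1 = validation.get_data_extracts(i=1, frame=True)   # DataFrame directly
```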
9806
10988
|
|
|
9807
10989
|
Examples
|
|
9808
10990
|
--------
|
|
@@ -10058,10 +11240,10 @@ class Validate:
|
|
|
10058
11240
|
Get the data that passed or failed the validation steps.
|
|
10059
11241
|
|
|
10060
11242
|
Validation of the data is one thing but, sometimes, you want to use the best part of the
|
|
10061
|
-
input dataset for something else. The `get_sundered_data()` method works with a Validate
|
|
11243
|
+
input dataset for something else. The `get_sundered_data()` method works with a `Validate`
|
|
10062
11244
|
object that has been interrogated (i.e., the
|
|
10063
11245
|
[`interrogate()`](`pointblank.Validate.interrogate`) method was used). We can get either the
|
|
10064
|
-
'pass' data piece (rows with no failing test units across all
|
|
11246
|
+
'pass' data piece (rows with no failing test units across all column-value based validation
|
|
10065
11247
|
functions), or, the 'fail' data piece (rows with at least one failing test unit across the
|
|
10066
11248
|
same series of validations).
|
|
10067
11249
|
|
|
@@ -10070,7 +11252,7 @@ class Validate:
|
|
|
10070
11252
|
There are some caveats to sundering. The validation steps considered for this splitting will
|
|
10071
11253
|
only involve steps where:
|
|
10072
11254
|
|
|
10073
|
-
- of certain check types, where test units are cells checked
|
|
11255
|
+
- of certain check types, where test units are cells checked down a column (e.g., the
|
|
10074
11256
|
`col_vals_*()` methods)
|
|
10075
11257
|
- `active=` is not set to `False`
|
|
10076
11258
|
- `pre=` has not been given an expression for modifying the input table
|
|
@@ -10301,6 +11483,19 @@ class Validate:
|
|
|
10301
11483
|
# Get information on the input data table
|
|
10302
11484
|
tbl_info = _get_tbl_type(data=self.data)
|
|
10303
11485
|
|
|
11486
|
+
# If the table is a Polars one, determine if it's a LazyFrame
|
|
11487
|
+
if tbl_info == "polars":
|
|
11488
|
+
if _is_lazy_frame(self.data):
|
|
11489
|
+
tbl_info = "polars-lazy"
|
|
11490
|
+
|
|
11491
|
+
# Determine if the input table is a Narwhals DF
|
|
11492
|
+
if _is_narwhals_table(self.data):
|
|
11493
|
+
# Determine if the Narwhals table is a LazyFrame
|
|
11494
|
+
if _is_lazy_frame(self.data):
|
|
11495
|
+
tbl_info = "narwhals-lazy"
|
|
11496
|
+
else:
|
|
11497
|
+
tbl_info = "narwhals"
|
|
11498
|
+
|
|
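`_is_lazy_frame()` and `_is_narwhals_table()` come from `pointblank._utils` and their bodies are not part of this diff. A hypothetical sketch of what such predicates could look like (illustrative only) is:

```python
import narwhals as nw


def is_narwhals_table_sketch(data) -> bool:
    # A table already wrapped by Narwhals, eager or lazy
    return isinstance(data, (nw.DataFrame, nw.LazyFrame))


def is_lazy_frame_sketch(data) -> bool:
    # A Narwhals-wrapped LazyFrame counts as lazy...
    if isinstance(data, nw.LazyFrame):
        return True
    # ...as does a native Polars LazyFrame, when Polars is installed
    try:
        import polars as pl
    except ImportError:
        return False
    return isinstance(data, pl.LazyFrame)
```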
10304
11499
|
# Get the thresholds object
|
|
10305
11500
|
thresholds = self.thresholds
|
|
10306
11501
|
|
|
@@ -10353,7 +11548,9 @@ class Validate:
|
|
|
10353
11548
|
# Create the label, table type, and thresholds HTML fragments
|
|
10354
11549
|
label_html = _create_label_html(label=self.label, start_time="")
|
|
10355
11550
|
table_type_html = _create_table_type_html(tbl_type=tbl_info, tbl_name=self.tbl_name)
|
|
10356
|
-
thresholds_html = _create_thresholds_html(
|
|
11551
|
+
thresholds_html = _create_thresholds_html(
|
|
11552
|
+
thresholds=thresholds, locale=locale, df_lib=df_lib
|
|
11553
|
+
)
|
|
10357
11554
|
|
|
10358
11555
|
# Compose the subtitle HTML fragment
|
|
10359
11556
|
combined_subtitle = (
|
|
@@ -10666,6 +11863,7 @@ class Validate:
|
|
|
10666
11863
|
interrogation_performed=interrogation_performed,
|
|
10667
11864
|
active=active,
|
|
10668
11865
|
locale=locale,
|
|
11866
|
+
df_lib=df_lib,
|
|
10669
11867
|
)
|
|
10670
11868
|
|
|
10671
11869
|
# ------------------------------------------------
|
|
@@ -10682,6 +11880,7 @@ class Validate:
|
|
|
10682
11880
|
interrogation_performed=interrogation_performed,
|
|
10683
11881
|
active=active,
|
|
10684
11882
|
locale=locale,
|
|
11883
|
+
df_lib=df_lib,
|
|
10685
11884
|
)
|
|
10686
11885
|
|
|
10687
11886
|
validation_info_dict["fail"] = _transform_passed_failed(
|
|
@@ -10690,6 +11889,7 @@ class Validate:
|
|
|
10690
11889
|
interrogation_performed=interrogation_performed,
|
|
10691
11890
|
active=active,
|
|
10692
11891
|
locale=locale,
|
|
11892
|
+
df_lib=df_lib,
|
|
10693
11893
|
)
|
|
10694
11894
|
|
|
10695
11895
|
# ------------------------------------------------
|
|
@@ -10869,7 +12069,9 @@ class Validate:
|
|
|
10869
12069
|
# Create the label, table type, and thresholds HTML fragments
|
|
10870
12070
|
label_html = _create_label_html(label=self.label, start_time=self.time_start)
|
|
10871
12071
|
table_type_html = _create_table_type_html(tbl_type=tbl_info, tbl_name=self.tbl_name)
|
|
10872
|
-
thresholds_html = _create_thresholds_html(
|
|
12072
|
+
thresholds_html = _create_thresholds_html(
|
|
12073
|
+
thresholds=thresholds, locale=locale, df_lib=df_lib
|
|
12074
|
+
)
|
|
10873
12075
|
|
|
10874
12076
|
# Compose the subtitle HTML fragment
|
|
10875
12077
|
combined_subtitle = (
|
|
@@ -11127,24 +12329,25 @@ class Validate:
|
|
|
11127
12329
|
Types of Step Reports
|
|
11128
12330
|
---------------------
|
|
11129
12331
|
The `get_step_report()` method produces a report based on the *type* of validation step.
|
|
11130
|
-
The following row-based validation methods will produce a
|
|
11131
|
-
|
|
12332
|
+
The following column-value or row-based validation methods will produce a
|
|
12333
|
+
report that shows the rows of the data that failed:
|
|
11132
12334
|
|
|
11133
12335
|
- [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
|
|
12336
|
+
- [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
|
|
11134
12337
|
- [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`)
|
|
12338
|
+
- [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
|
|
11135
12339
|
- [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`)
|
|
11136
12340
|
- [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`)
|
|
11137
|
-
- [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
|
|
11138
|
-
- [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
|
|
11139
12341
|
- [`col_vals_between()`](`pointblank.Validate.col_vals_between`)
|
|
11140
12342
|
- [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
|
|
11141
12343
|
- [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
|
|
11142
12344
|
- [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
|
|
11143
|
-
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
11144
12345
|
- [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
|
|
11145
12346
|
- [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
|
|
11146
|
-
- [`
|
|
12347
|
+
- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
|
|
12348
|
+
- [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
|
|
11147
12349
|
- [`conjointly()`](`pointblank.Validate.conjointly`)
|
|
12350
|
+
- [`rows_complete()`](`pointblank.Validate.rows_complete`)
|
|
11148
12351
|
|
|
11149
12352
|
The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
|
|
11150
12353
|
report that shows duplicate rows (or duplicate values in one or a set of columns as defined
|
|
@@ -12671,20 +13874,78 @@ def _transform_eval(
|
|
|
12671
13874
|
return symbol_list
|
|
12672
13875
|
|
|
12673
13876
|
|
|
13877
|
+
def _format_numbers_with_gt(
|
|
13878
|
+
values: list[int], n_sigfig: int = 3, compact: bool = True, locale: str = "en"
|
|
13879
|
+
) -> list[str]:
|
|
13880
|
+
"""Format numbers using Great Tables GT object to avoid pandas dependency."""
|
|
13881
|
+
import polars as pl
|
|
13882
|
+
|
|
13883
|
+
# Create a single-column DataFrame with all values
|
|
13884
|
+
df = pl.DataFrame({"values": values})
|
|
13885
|
+
|
|
13886
|
+
# Create GT object and format the column
|
|
13887
|
+
gt_obj = GT(df).fmt_number(columns="values", n_sigfig=n_sigfig, compact=compact, locale=locale)
|
|
13888
|
+
|
|
13889
|
+
# Extract the formatted values using _get_column_of_values
|
|
13890
|
+
formatted_values = _get_column_of_values(gt_obj, column_name="values", context="html")
|
|
13891
|
+
|
|
13892
|
+
return formatted_values
|
|
13893
|
+
|
|
13894
|
+
|
|
13895
|
+
def _format_single_number_with_gt(
|
|
13896
|
+
value: int, n_sigfig: int = 3, compact: bool = True, locale: str = "en", df_lib=None
|
|
13897
|
+
) -> str:
|
|
13898
|
+
"""Format a single number using Great Tables GT object to avoid pandas dependency."""
|
|
13899
|
+
if df_lib is None:
|
|
13900
|
+
# Use library detection to select appropriate DataFrame library
|
|
13901
|
+
if _is_lib_present("polars"):
|
|
13902
|
+
import polars as pl
|
|
13903
|
+
|
|
13904
|
+
df_lib = pl
|
|
13905
|
+
elif _is_lib_present("pandas"):
|
|
13906
|
+
import pandas as pd
|
|
13907
|
+
|
|
13908
|
+
df_lib = pd
|
|
13909
|
+
else:
|
|
13910
|
+
raise ImportError("Neither Polars nor Pandas is available for formatting")
|
|
13911
|
+
|
|
13912
|
+
# Create a single-row, single-column DataFrame using the specified library
|
|
13913
|
+
df = df_lib.DataFrame({"value": [value]})
|
|
13914
|
+
|
|
13915
|
+
# Create GT object and format the column
|
|
13916
|
+
gt_obj = GT(df).fmt_number(columns="value", n_sigfig=n_sigfig, compact=compact, locale=locale)
|
|
13917
|
+
|
|
13918
|
+
# Extract the formatted value using _get_column_of_values
|
|
13919
|
+
formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
|
|
13920
|
+
|
|
13921
|
+
return formatted_values[0] # Return the single formatted value
|
|
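A small, hedged illustration of how these private helpers are meant to be called, assuming it runs inside the `pointblank/validate.py` module scope where they are defined; the exact rendered strings depend on Great Tables' compact and locale formatting, so no specific output is assumed.

```python
# Values at or above 10,000 get compact, 3-significant-figure labels
labels = _format_numbers_with_gt([9_500, 1_250_000], n_sigfig=3, compact=True)

# Single-value variant; leaving `df_lib=None` lets the helper pick Polars or Pandas
one_label = _format_single_number_with_gt(1_250_000, locale="de")
```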
13922
|
+
|
|
13923
|
+
|
|
12674
13924
|
def _transform_test_units(
|
|
12675
|
-
test_units: list[int],
|
|
13925
|
+
test_units: list[int],
|
|
13926
|
+
interrogation_performed: bool,
|
|
13927
|
+
active: list[bool],
|
|
13928
|
+
locale: str,
|
|
13929
|
+
df_lib=None,
|
|
12676
13930
|
) -> list[str]:
|
|
12677
13931
|
# If no interrogation was performed, return a list of empty strings
|
|
12678
13932
|
if not interrogation_performed:
|
|
12679
13933
|
return ["" for _ in range(len(test_units))]
|
|
12680
13934
|
|
|
13935
|
+
# Define the helper function that'll format numbers safely with Great Tables
|
|
13936
|
+
def _format_number_safe(value: int) -> str:
|
|
13937
|
+
if df_lib is not None:
|
|
13938
|
+
# Use GT-based formatting to avoid Pandas dependency completely
|
|
13939
|
+
return _format_single_number_with_gt(
|
|
13940
|
+
value, n_sigfig=3, compact=True, locale=locale, df_lib=df_lib
|
|
13941
|
+
)
|
|
13942
|
+
else:
|
|
13943
|
+
# Fallback to the original behavior
|
|
13944
|
+
return str(vals.fmt_number(value, n_sigfig=3, compact=True, locale=locale)[0])
|
|
13945
|
+
|
|
12681
13946
|
return [
|
|
12682
13947
|
(
|
|
12683
|
-
(
|
|
12684
|
-
str(test_units[i])
|
|
12685
|
-
if test_units[i] < 10000
|
|
12686
|
-
else str(vals.fmt_number(test_units[i], n_sigfig=3, compact=True, locale=locale)[0])
|
|
12687
|
-
)
|
|
13948
|
+
(str(test_units[i]) if test_units[i] < 10000 else _format_number_safe(test_units[i]))
|
|
12688
13949
|
if active[i]
|
|
12689
13950
|
else "—"
|
|
12690
13951
|
)
|
|
@@ -12692,8 +13953,43 @@ def _transform_test_units(
|
|
|
12692
13953
|
]
|
|
12693
13954
|
|
|
12694
13955
|
|
|
12695
|
-
def _fmt_lg(value: int, locale: str) -> str:
|
|
12696
|
-
|
|
13956
|
+
def _fmt_lg(value: int, locale: str, df_lib=None) -> str:
|
|
13957
|
+
if df_lib is not None:
|
|
13958
|
+
# Use GT-based formatting if a DataFrame library is provided
|
|
13959
|
+
return _format_single_number_with_gt(
|
|
13960
|
+
value, n_sigfig=3, compact=True, locale=locale, df_lib=df_lib
|
|
13961
|
+
)
|
|
13962
|
+
else:
|
|
13963
|
+
# Fallback to the original behavior
|
|
13964
|
+
return vals.fmt_number(value, n_sigfig=3, compact=True, locale=locale)[0]
|
|
13965
|
+
|
|
13966
|
+
|
|
13967
|
+
def _format_single_float_with_gt(
|
|
13968
|
+
value: float, decimals: int = 2, locale: str = "en", df_lib=None
|
|
13969
|
+
) -> str:
|
|
13970
|
+
if df_lib is None:
|
|
13971
|
+
# Use library detection to select appropriate DataFrame library
|
|
13972
|
+
if _is_lib_present("polars"):
|
|
13973
|
+
import polars as pl
|
|
13974
|
+
|
|
13975
|
+
df_lib = pl
|
|
13976
|
+
elif _is_lib_present("pandas"):
|
|
13977
|
+
import pandas as pd
|
|
13978
|
+
|
|
13979
|
+
df_lib = pd
|
|
13980
|
+
else:
|
|
13981
|
+
raise ImportError("Neither Polars nor Pandas is available for formatting")
|
|
13982
|
+
|
|
13983
|
+
# Create a single-row, single-column DataFrame using the specified library
|
|
13984
|
+
df = df_lib.DataFrame({"value": [value]})
|
|
13985
|
+
|
|
13986
|
+
# Create GT object and format the column
|
|
13987
|
+
gt_obj = GT(df).fmt_number(columns="value", decimals=decimals, locale=locale)
|
|
13988
|
+
|
|
13989
|
+
# Extract the formatted value using _get_column_of_values
|
|
13990
|
+
formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
|
|
13991
|
+
|
|
13992
|
+
return formatted_values[0] # Return the single formatted value
|
|
12697
13993
|
|
|
12698
13994
|
|
|
12699
13995
|
def _transform_passed_failed(
|
|
@@ -12702,14 +13998,24 @@ def _transform_passed_failed(
|
|
|
12702
13998
|
interrogation_performed: bool,
|
|
12703
13999
|
active: list[bool],
|
|
12704
14000
|
locale: str,
|
|
14001
|
+
df_lib=None,
|
|
12705
14002
|
) -> list[str]:
|
|
12706
14003
|
if not interrogation_performed:
|
|
12707
14004
|
return ["" for _ in range(len(n_passed_failed))]
|
|
12708
14005
|
|
|
14006
|
+
# Helper function to format numbers safely
|
|
14007
|
+
def _format_float_safe(value: float) -> str:
|
|
14008
|
+
if df_lib is not None:
|
|
14009
|
+
# Use GT-based formatting to avoid Pandas dependency completely
|
|
14010
|
+
return _format_single_float_with_gt(value, decimals=2, locale=locale, df_lib=df_lib)
|
|
14011
|
+
else:
|
|
14012
|
+
# Fallback to the original behavior
|
|
14013
|
+
return vals.fmt_number(value, decimals=2, locale=locale)[0]
|
|
14014
|
+
|
|
12709
14015
|
passed_failed = [
|
|
12710
14016
|
(
|
|
12711
|
-
f"{n_passed_failed[i] if n_passed_failed[i] < 10000 else _fmt_lg(n_passed_failed[i], locale=locale)}"
|
|
12712
|
-
f"<br />{
|
|
14017
|
+
f"{n_passed_failed[i] if n_passed_failed[i] < 10000 else _fmt_lg(n_passed_failed[i], locale=locale, df_lib=df_lib)}"
|
|
14018
|
+
f"<br />{_format_float_safe(f_passed_failed[i])}"
|
|
12713
14019
|
if active[i]
|
|
12714
14020
|
else "—"
|
|
12715
14021
|
)
|
|
@@ -12920,41 +14226,122 @@ def _create_label_html(label: str | None, start_time: str) -> str:
|
|
|
12920
14226
|
)
|
|
12921
14227
|
|
|
12922
14228
|
|
|
12923
|
-
def
|
|
14229
|
+
def _format_single_integer_with_gt(value: int, locale: str = "en", df_lib=None) -> str:
|
|
14230
|
+
"""Format a single integer using Great Tables GT object to avoid pandas dependency."""
|
|
14231
|
+
if df_lib is None:
|
|
14232
|
+
# Use library detection to select appropriate DataFrame library
|
|
14233
|
+
if _is_lib_present("polars"):
|
|
14234
|
+
import polars as pl
|
|
14235
|
+
|
|
14236
|
+
df_lib = pl
|
|
14237
|
+
elif _is_lib_present("pandas"):
|
|
14238
|
+
import pandas as pd
|
|
14239
|
+
|
|
14240
|
+
df_lib = pd
|
|
14241
|
+
else:
|
|
14242
|
+
raise ImportError("Neither Polars nor Pandas is available for formatting")
|
|
14243
|
+
|
|
14244
|
+
# Create a single-row, single-column DataFrame using the specified library
|
|
14245
|
+
df = df_lib.DataFrame({"value": [value]})
|
|
14246
|
+
|
|
14247
|
+
# Create GT object and format the column
|
|
14248
|
+
gt_obj = GT(df).fmt_integer(columns="value", locale=locale)
|
|
14249
|
+
|
|
14250
|
+
# Extract the formatted value using _get_column_of_values
|
|
14251
|
+
formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
|
|
14252
|
+
|
|
14253
|
+
return formatted_values[0] # Return the single formatted value
|
|
14254
|
+
|
|
14255
|
+
|
|
14256
|
+
def _format_single_float_with_gt_custom(
|
|
14257
|
+
value: float,
|
|
14258
|
+
decimals: int = 2,
|
|
14259
|
+
drop_trailing_zeros: bool = False,
|
|
14260
|
+
locale: str = "en",
|
|
14261
|
+
df_lib=None,
|
|
14262
|
+
) -> str:
|
|
14263
|
+
"""Format a single float with custom options using Great Tables GT object to avoid pandas dependency."""
|
|
14264
|
+
if df_lib is None:
|
|
14265
|
+
# Use library detection to select appropriate DataFrame library
|
|
14266
|
+
if _is_lib_present("polars"):
|
|
14267
|
+
import polars as pl
|
|
14268
|
+
|
|
14269
|
+
df_lib = pl
|
|
14270
|
+
elif _is_lib_present("pandas"):
|
|
14271
|
+
import pandas as pd
|
|
14272
|
+
|
|
14273
|
+
df_lib = pd
|
|
14274
|
+
else:
|
|
14275
|
+
raise ImportError("Neither Polars nor Pandas is available for formatting")
|
|
14276
|
+
|
|
14277
|
+
# Create a single-row, single-column DataFrame using the specified library
|
|
14278
|
+
df = df_lib.DataFrame({"value": [value]})
|
|
14279
|
+
|
|
14280
|
+
# Create GT object and format the column
|
|
14281
|
+
gt_obj = GT(df).fmt_number(
|
|
14282
|
+
columns="value", decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
|
|
14283
|
+
)
|
|
14284
|
+
|
|
14285
|
+
# Extract the formatted value using _get_column_of_values
|
|
14286
|
+
formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
|
|
14287
|
+
|
|
14288
|
+
return formatted_values[0] # Return the single formatted value
|
|
14289
|
+
|
|
14290
|
+
|
|
14291
|
+
def _create_thresholds_html(thresholds: Thresholds, locale: str, df_lib=None) -> str:
|
|
12924
14292
|
if thresholds == Thresholds():
|
|
12925
14293
|
return ""
|
|
12926
14294
|
|
|
14295
|
+
# Helper functions to format numbers safely
|
|
14296
|
+
def _format_number_safe(value: float, decimals: int, drop_trailing_zeros: bool = False) -> str:
|
|
14297
|
+
if df_lib is not None and value is not None:
|
|
14298
|
+
# Use GT-based formatting to avoid Pandas dependency completely
|
|
14299
|
+
return _format_single_float_with_gt_custom(
|
|
14300
|
+
value,
|
|
14301
|
+
decimals=decimals,
|
|
14302
|
+
drop_trailing_zeros=drop_trailing_zeros,
|
|
14303
|
+
locale=locale,
|
|
14304
|
+
df_lib=df_lib,
|
|
14305
|
+
)
|
|
14306
|
+
else:
|
|
14307
|
+
# Fallback to the original behavior
|
|
14308
|
+
return fmt_number(
|
|
14309
|
+
value, decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
|
|
14310
|
+
)[0]
|
|
14311
|
+
|
|
14312
|
+
def _format_integer_safe(value: int) -> str:
|
|
14313
|
+
if df_lib is not None and value is not None:
|
|
14314
|
+
# Use GT-based formatting to avoid Pandas dependency completely
|
|
14315
|
+
return _format_single_integer_with_gt(value, locale=locale, df_lib=df_lib)
|
|
14316
|
+
else:
|
|
14317
|
+
# Fallback to the original behavior
|
|
14318
|
+
return fmt_integer(value, locale=locale)[0]
|
|
14319
|
+
|
|
12927
14320
|
warning = (
|
|
12928
|
-
|
|
12929
|
-
thresholds.warning_fraction, decimals=3, drop_trailing_zeros=True, locale=locale
|
|
12930
|
-
)[0]
|
|
14321
|
+
_format_number_safe(thresholds.warning_fraction, decimals=3, drop_trailing_zeros=True)
|
|
12931
14322
|
if thresholds.warning_fraction is not None
|
|
12932
14323
|
else (
|
|
12933
|
-
|
|
14324
|
+
_format_integer_safe(thresholds.warning_count)
|
|
12934
14325
|
if thresholds.warning_count is not None
|
|
12935
14326
|
else "—"
|
|
12936
14327
|
)
|
|
12937
14328
|
)
|
|
12938
14329
|
|
|
12939
14330
|
error = (
|
|
12940
|
-
|
|
12941
|
-
0
|
|
12942
|
-
]
|
|
14331
|
+
_format_number_safe(thresholds.error_fraction, decimals=3, drop_trailing_zeros=True)
|
|
12943
14332
|
if thresholds.error_fraction is not None
|
|
12944
14333
|
else (
|
|
12945
|
-
|
|
14334
|
+
_format_integer_safe(thresholds.error_count)
|
|
12946
14335
|
if thresholds.error_count is not None
|
|
12947
14336
|
else "—"
|
|
12948
14337
|
)
|
|
12949
14338
|
)
|
|
12950
14339
|
|
|
12951
14340
|
critical = (
|
|
12952
|
-
|
|
12953
|
-
thresholds.critical_fraction, decimals=3, drop_trailing_zeros=True, locale=locale
|
|
12954
|
-
)[0]
|
|
14341
|
+
_format_number_safe(thresholds.critical_fraction, decimals=3, drop_trailing_zeros=True)
|
|
12955
14342
|
if thresholds.critical_fraction is not None
|
|
12956
14343
|
else (
|
|
12957
|
-
|
|
14344
|
+
_format_integer_safe(thresholds.critical_count)
|
|
12958
14345
|
if thresholds.critical_count is not None
|
|
12959
14346
|
else "—"
|
|
12960
14347
|
)
|