pointblank 0.9.6__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +4 -0
- pointblank/_constants.py +4 -0
- pointblank/_datascan_utils.py +65 -0
- pointblank/_utils.py +126 -0
- pointblank/_utils_html.py +40 -0
- pointblank/assistant.py +1 -3
- pointblank/cli.py +2737 -0
- pointblank/compare.py +27 -0
- pointblank/data/api-docs.txt +518 -125
- pointblank/datascan.py +318 -959
- pointblank/scan_profile.py +321 -0
- pointblank/scan_profile_stats.py +180 -0
- pointblank/schema.py +14 -3
- pointblank/validate.py +1425 -202
- {pointblank-0.9.6.dist-info → pointblank-0.11.0.dist-info}/METADATA +49 -3
- {pointblank-0.9.6.dist-info → pointblank-0.11.0.dist-info}/RECORD +20 -14
- {pointblank-0.9.6.dist-info → pointblank-0.11.0.dist-info}/WHEEL +1 -1
- pointblank-0.11.0.dist-info/entry_points.txt +2 -0
- {pointblank-0.9.6.dist-info → pointblank-0.11.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.9.6.dist-info → pointblank-0.11.0.dist-info}/top_level.txt +0 -0
pointblank/validate.py
CHANGED
|
@@ -17,6 +17,7 @@ from zipfile import ZipFile
|
|
|
17
17
|
import commonmark
|
|
18
18
|
import narwhals as nw
|
|
19
19
|
from great_tables import GT, from_column, google_font, html, loc, md, style, vals
|
|
20
|
+
from great_tables.gt import _get_column_of_values
|
|
20
21
|
from great_tables.vals import fmt_integer, fmt_number
|
|
21
22
|
from importlib_resources import files
|
|
22
23
|
from narwhals.typing import FrameT
|
|
@@ -64,11 +65,15 @@ from pointblank._typing import SegmentSpec
|
|
|
64
65
|
from pointblank._utils import (
|
|
65
66
|
_check_any_df_lib,
|
|
66
67
|
_check_invalid_fields,
|
|
68
|
+
_count_null_values_in_column,
|
|
69
|
+
_count_true_values_in_column,
|
|
67
70
|
_derive_bounds,
|
|
68
71
|
_format_to_integer_value,
|
|
69
72
|
_get_fn_name,
|
|
70
73
|
_get_tbl_type,
|
|
74
|
+
_is_lazy_frame,
|
|
71
75
|
_is_lib_present,
|
|
76
|
+
_is_narwhals_table,
|
|
72
77
|
_is_value_a_df,
|
|
73
78
|
_select_df_lib,
|
|
74
79
|
)
|
|
@@ -99,11 +104,13 @@ __all__ = [
|
|
|
99
104
|
"Validate",
|
|
100
105
|
"load_dataset",
|
|
101
106
|
"config",
|
|
107
|
+
"connect_to_table",
|
|
102
108
|
"preview",
|
|
103
109
|
"missing_vals_tbl",
|
|
110
|
+
"get_action_metadata",
|
|
104
111
|
"get_column_count",
|
|
112
|
+
"get_data_path",
|
|
105
113
|
"get_row_count",
|
|
106
|
-
"get_action_metadata",
|
|
107
114
|
"get_validation_summary",
|
|
108
115
|
]
|
|
109
116
|
|
|
@@ -495,7 +502,9 @@ def load_dataset(
|
|
|
495
502
|
raise ValueError(
|
|
496
503
|
f"The dataset name `{dataset}` is not valid. Choose one of the following:\n"
|
|
497
504
|
"- `small_table`\n"
|
|
498
|
-
"- `game_revenue
|
|
505
|
+
"- `game_revenue`\n"
|
|
506
|
+
"- `nycflights`\n"
|
|
507
|
+
"- `global_sales`"
|
|
499
508
|
)
|
|
500
509
|
|
|
501
510
|
# Raise an error if the `tbl_type=` value is not of the supported types
|
|
@@ -560,6 +569,405 @@ def load_dataset(
|
|
|
560
569
|
return dataset
|
|
561
570
|
|
|
562
571
|
|
|
572
|
+
def get_data_path(
|
|
573
|
+
dataset: Literal["small_table", "game_revenue", "nycflights", "global_sales"] = "small_table",
|
|
574
|
+
file_type: Literal["csv", "parquet", "duckdb"] = "csv",
|
|
575
|
+
) -> str:
|
|
576
|
+
"""
|
|
577
|
+
Get the file path to a dataset included with the Pointblank package.
|
|
578
|
+
|
|
579
|
+
This function provides direct access to the file paths of datasets included with Pointblank.
|
|
580
|
+
These paths can be used in examples and documentation to demonstrate file-based data loading
|
|
581
|
+
without requiring the actual data files. The returned paths can be used with
|
|
582
|
+
`Validate(data=path)` to demonstrate CSV and Parquet file loading capabilities.
|
|
583
|
+
|
|
584
|
+
Parameters
|
|
585
|
+
----------
|
|
586
|
+
dataset
|
|
587
|
+
The name of the dataset to get the path for. Current options are `"small_table"`,
|
|
588
|
+
`"game_revenue"`, `"nycflights"`, and `"global_sales"`.
|
|
589
|
+
file_type
|
|
590
|
+
The file format to get the path for. Options are `"csv"`, `"parquet"`, or `"duckdb"`.
|
|
591
|
+
|
|
592
|
+
Returns
|
|
593
|
+
-------
|
|
594
|
+
str
|
|
595
|
+
The file path to the requested dataset file.
|
|
596
|
+
|
|
597
|
+
Included Datasets
|
|
598
|
+
-----------------
|
|
599
|
+
The available datasets are the same as those in [`load_dataset()`](`pointblank.load_dataset`):
|
|
600
|
+
|
|
601
|
+
- `"small_table"`: A small dataset with 13 rows and 8 columns. Ideal for testing and examples.
|
|
602
|
+
- `"game_revenue"`: A dataset with 2000 rows and 11 columns. Revenue data for a game company.
|
|
603
|
+
- `"nycflights"`: A dataset with 336,776 rows and 18 columns. Flight data from NYC airports.
|
|
604
|
+
- `"global_sales"`: A dataset with 50,000 rows and 20 columns. Global sales data across regions.
|
|
605
|
+
|
|
606
|
+
File Types
|
|
607
|
+
----------
|
|
608
|
+
Each dataset is available in multiple formats:
|
|
609
|
+
|
|
610
|
+
- `"csv"`: Comma-separated values file (`.csv`)
|
|
611
|
+
- `"parquet"`: Parquet file (`.parquet`)
|
|
612
|
+
- `"duckdb"`: DuckDB database file (`.ddb`)
|
|
613
|
+
|
|
614
|
+
Examples
|
|
615
|
+
--------
|
|
616
|
+
Get the path to a CSV file and use it with `Validate`:
|
|
617
|
+
|
|
618
|
+
```{python}
|
|
619
|
+
import pointblank as pb
|
|
620
|
+
|
|
621
|
+
# Get path to the small_table CSV file
|
|
622
|
+
csv_path = pb.get_data_path("small_table", "csv")
|
|
623
|
+
print(csv_path)
|
|
624
|
+
|
|
625
|
+
# Use the path directly with Validate
|
|
626
|
+
validation = (
|
|
627
|
+
pb.Validate(data=csv_path)
|
|
628
|
+
.col_exists(["a", "b", "c"])
|
|
629
|
+
.col_vals_gt(columns="d", value=0)
|
|
630
|
+
.interrogate()
|
|
631
|
+
)
|
|
632
|
+
|
|
633
|
+
validation
|
|
634
|
+
```
|
|
635
|
+
|
|
636
|
+
Get a Parquet file path for validation examples:
|
|
637
|
+
|
|
638
|
+
```{python}
|
|
639
|
+
# Get path to the game_revenue Parquet file
|
|
640
|
+
parquet_path = pb.get_data_path(dataset="game_revenue", file_type="parquet")
|
|
641
|
+
|
|
642
|
+
# Validate the Parquet file directly
|
|
643
|
+
validation = (
|
|
644
|
+
pb.Validate(data=parquet_path, label="Game Revenue Data Validation")
|
|
645
|
+
.col_vals_not_null(columns=["player_id", "session_id"])
|
|
646
|
+
.col_vals_gt(columns="item_revenue", value=0)
|
|
647
|
+
.interrogate()
|
|
648
|
+
)
|
|
649
|
+
|
|
650
|
+
validation
|
|
651
|
+
```
|
|
652
|
+
|
|
653
|
+
This is particularly useful for documentation examples where you want to demonstrate
|
|
654
|
+
file-based workflows without requiring users to have specific data files:
|
|
655
|
+
|
|
656
|
+
```{python}
|
|
657
|
+
# Example showing CSV file validation
|
|
658
|
+
sales_csv = pb.get_data_path(dataset="global_sales", file_type="csv")
|
|
659
|
+
|
|
660
|
+
validation = (
|
|
661
|
+
pb.Validate(data=sales_csv, label="Sales Data Validation")
|
|
662
|
+
.col_exists(["customer_id", "product_id", "amount"])
|
|
663
|
+
.col_vals_regex(columns="customer_id", pattern=r"CUST_[0-9]{6}")
|
|
664
|
+
.interrogate()
|
|
665
|
+
)
|
|
666
|
+
```
|
|
667
|
+
|
|
668
|
+
See Also
|
|
669
|
+
--------
|
|
670
|
+
[`load_dataset()`](`pointblank.load_dataset`) for loading datasets directly as table objects.
|
|
671
|
+
"""
|
|
672
|
+
|
|
673
|
+
# Validate inputs
|
|
674
|
+
if dataset not in ["small_table", "game_revenue", "nycflights", "global_sales"]:
|
|
675
|
+
raise ValueError(
|
|
676
|
+
f"The dataset name `{dataset}` is not valid. Choose one of the following:\n"
|
|
677
|
+
"- `small_table`\n"
|
|
678
|
+
"- `game_revenue`\n"
|
|
679
|
+
"- `nycflights`\n"
|
|
680
|
+
"- `global_sales`"
|
|
681
|
+
)
|
|
682
|
+
|
|
683
|
+
if file_type not in ["csv", "parquet", "duckdb"]:
|
|
684
|
+
raise ValueError(
|
|
685
|
+
f"The file type `{file_type}` is not valid. Choose one of the following:\n"
|
|
686
|
+
"- `csv`\n"
|
|
687
|
+
"- `parquet`\n"
|
|
688
|
+
"- `duckdb`"
|
|
689
|
+
)
|
|
690
|
+
|
|
691
|
+
if file_type == "csv":
|
|
692
|
+
# Return path to CSV file inside the zip
|
|
693
|
+
data_path = files("pointblank.data") / f"{dataset}.zip"
|
|
694
|
+
|
|
695
|
+
# For CSV files, we need to extract from zip to a temporary location
|
|
696
|
+
# since most libraries expect actual file paths, not zip contents
|
|
697
|
+
with tempfile.NamedTemporaryFile(mode="wb", suffix=".csv", delete=False) as tmp_file:
|
|
698
|
+
with ZipFile(data_path) as zip_file:
|
|
699
|
+
csv_content = zip_file.read(f"{dataset}.csv")
|
|
700
|
+
tmp_file.write(csv_content)
|
|
701
|
+
return tmp_file.name
|
|
702
|
+
|
|
703
|
+
elif file_type == "parquet":
|
|
704
|
+
# Create a temporary parquet file from the CSV data
|
|
705
|
+
data_path = files("pointblank.data") / f"{dataset}.zip"
|
|
706
|
+
|
|
707
|
+
# We'll need to convert CSV to Parquet temporarily
|
|
708
|
+
with tempfile.NamedTemporaryFile(mode="wb", suffix=".parquet", delete=False) as tmp_file:
|
|
709
|
+
# Load CSV data and save as Parquet
|
|
710
|
+
if _is_lib_present(lib_name="polars"):
|
|
711
|
+
import polars as pl
|
|
712
|
+
|
|
713
|
+
df = pl.read_csv(ZipFile(data_path).read(f"{dataset}.csv"), try_parse_dates=True)
|
|
714
|
+
df.write_parquet(tmp_file.name)
|
|
715
|
+
elif _is_lib_present(lib_name="pandas"):
|
|
716
|
+
import pandas as pd
|
|
717
|
+
|
|
718
|
+
df = pd.read_csv(data_path)
|
|
719
|
+
df.to_parquet(tmp_file.name, index=False)
|
|
720
|
+
else:
|
|
721
|
+
raise ImportError(
|
|
722
|
+
"Either Polars or Pandas is required to create temporary Parquet files."
|
|
723
|
+
)
|
|
724
|
+
return tmp_file.name
|
|
725
|
+
|
|
726
|
+
elif file_type == "duckdb":
|
|
727
|
+
# Return path to DuckDB file
|
|
728
|
+
data_path = files("pointblank.data") / f"{dataset}-duckdb.zip"
|
|
729
|
+
|
|
730
|
+
# Extract DuckDB file to temporary location
|
|
731
|
+
with tempfile.NamedTemporaryFile(mode="wb", suffix=".ddb", delete=False) as tmp_file:
|
|
732
|
+
with ZipFile(data_path) as zip_file:
|
|
733
|
+
ddb_content = zip_file.read(f"{dataset}.ddb")
|
|
734
|
+
tmp_file.write(ddb_content)
|
|
735
|
+
return tmp_file.name
|
|
736
|
+
|
|
737
|
+
|
|
738
|
+
# =============================================================================
|
|
739
|
+
# Utility functions for processing input data (shared by preview() and Validate class)
|
|
740
|
+
# =============================================================================
|
|
741
|
+
|
|
742
|
+
|
|
743
|
+
def _process_connection_string(data: FrameT | Any) -> FrameT | Any:
|
|
744
|
+
"""
|
|
745
|
+
Process data parameter to handle database connection strings.
|
|
746
|
+
|
|
747
|
+
Uses the `connect_to_table()` utility function to handle URI-formatted connection strings with
|
|
748
|
+
table specifications. Returns the original data if it's not a connection string.
|
|
749
|
+
|
|
750
|
+
For more details on supported connection string formats, see the documentation
|
|
751
|
+
for `connect_to_table()`.
|
|
752
|
+
"""
|
|
753
|
+
# Check if data is a string that looks like a connection string
|
|
754
|
+
if not isinstance(data, str):
|
|
755
|
+
return data
|
|
756
|
+
|
|
757
|
+
# Basic connection string patterns
|
|
758
|
+
connection_patterns = [
|
|
759
|
+
"://", # General URL-like pattern
|
|
760
|
+
]
|
|
761
|
+
|
|
762
|
+
# Check if it looks like a connection string
|
|
763
|
+
if not any(pattern in data for pattern in connection_patterns):
|
|
764
|
+
return data
|
|
765
|
+
|
|
766
|
+
# Use the utility function to connect to the table
|
|
767
|
+
return connect_to_table(data)
|
|
768
|
+
|
|
769
|
+
|
|
770
|
+
def _process_csv_input(data: FrameT | Any) -> FrameT | Any:
|
|
771
|
+
"""
|
|
772
|
+
Process data parameter to handle CSV file inputs.
|
|
773
|
+
|
|
774
|
+
If data is a string or Path with .csv extension, reads the CSV file
|
|
775
|
+
using available libraries (Polars preferred, then Pandas).
|
|
776
|
+
|
|
777
|
+
Returns the original data if it's not a CSV file path.
|
|
778
|
+
"""
|
|
779
|
+
from pathlib import Path
|
|
780
|
+
|
|
781
|
+
# Check if data is a string or Path-like object with .csv extension
|
|
782
|
+
csv_path = None
|
|
783
|
+
|
|
784
|
+
if isinstance(data, (str, Path)):
|
|
785
|
+
path_obj = Path(data)
|
|
786
|
+
if path_obj.suffix.lower() == ".csv":
|
|
787
|
+
csv_path = path_obj
|
|
788
|
+
|
|
789
|
+
# If it's not a CSV file path, return the original data
|
|
790
|
+
if csv_path is None:
|
|
791
|
+
return data
|
|
792
|
+
|
|
793
|
+
# Check if the CSV file exists
|
|
794
|
+
if not csv_path.exists():
|
|
795
|
+
raise FileNotFoundError(f"CSV file not found: {csv_path}")
|
|
796
|
+
|
|
797
|
+
# Determine which library to use for reading CSV
|
|
798
|
+
# Prefer Polars, fallback to Pandas
|
|
799
|
+
if _is_lib_present(lib_name="polars"):
|
|
800
|
+
try:
|
|
801
|
+
import polars as pl
|
|
802
|
+
|
|
803
|
+
return pl.read_csv(csv_path, try_parse_dates=True)
|
|
804
|
+
except Exception as e:
|
|
805
|
+
# If Polars fails, try Pandas if available
|
|
806
|
+
if _is_lib_present(lib_name="pandas"):
|
|
807
|
+
import pandas as pd
|
|
808
|
+
|
|
809
|
+
return pd.read_csv(csv_path)
|
|
810
|
+
else:
|
|
811
|
+
raise RuntimeError(
|
|
812
|
+
f"Failed to read CSV file with Polars: {e}. "
|
|
813
|
+
"Pandas is not available as fallback."
|
|
814
|
+
) from e
|
|
815
|
+
elif _is_lib_present(lib_name="pandas"):
|
|
816
|
+
try:
|
|
817
|
+
import pandas as pd
|
|
818
|
+
|
|
819
|
+
return pd.read_csv(csv_path)
|
|
820
|
+
except Exception as e:
|
|
821
|
+
raise RuntimeError(f"Failed to read CSV file with Pandas: {e}") from e
|
|
822
|
+
else:
|
|
823
|
+
raise ImportError(
|
|
824
|
+
"Neither Polars nor Pandas is available for reading CSV files. "
|
|
825
|
+
"Please install either 'polars' or 'pandas' to use CSV file inputs."
|
|
826
|
+
)
|
|
827
|
+
|
|
828
|
+
|
|
829
|
+
def _process_parquet_input(data: FrameT | Any) -> FrameT | Any:
|
|
830
|
+
"""
|
|
831
|
+
Process data parameter to handle Parquet file inputs.
|
|
832
|
+
|
|
833
|
+
Supports:
|
|
834
|
+
- single .parquet file (string or Path)
|
|
835
|
+
- glob patterns for multiple .parquet files (e.g., "data/*.parquet")
|
|
836
|
+
- directory containing .parquet files
|
|
837
|
+
- partitioned Parquet datasets with automatic partition column inference
|
|
838
|
+
- list/sequence of .parquet file paths
|
|
839
|
+
|
|
840
|
+
Returns the original data if it's not a Parquet file input.
|
|
841
|
+
"""
|
|
842
|
+
import glob
|
|
843
|
+
from pathlib import Path
|
|
844
|
+
|
|
845
|
+
parquet_paths = []
|
|
846
|
+
|
|
847
|
+
# Handle different input types
|
|
848
|
+
if isinstance(data, (str, Path)):
|
|
849
|
+
data_str = str(data)
|
|
850
|
+
path_obj = Path(data)
|
|
851
|
+
|
|
852
|
+
# Check if it's a glob pattern containing .parquet first; look for glob
|
|
853
|
+
# characters: `*`, `?`, `[`, `]`
|
|
854
|
+
if ".parquet" in data_str.lower() and any(
|
|
855
|
+
char in data_str for char in ["*", "?", "[", "]"]
|
|
856
|
+
):
|
|
857
|
+
parquet_files = glob.glob(data_str)
|
|
858
|
+
if parquet_files:
|
|
859
|
+
parquet_paths = sorted([Path(f) for f in parquet_files])
|
|
860
|
+
else:
|
|
861
|
+
raise FileNotFoundError(f"No files found matching pattern: {data}")
|
|
862
|
+
|
|
863
|
+
# Check if it's a single .parquet file
|
|
864
|
+
elif path_obj.suffix.lower() == ".parquet":
|
|
865
|
+
if path_obj.exists():
|
|
866
|
+
parquet_paths = [path_obj]
|
|
867
|
+
else:
|
|
868
|
+
raise FileNotFoundError(f"Parquet file not found: {path_obj}")
|
|
869
|
+
|
|
870
|
+
# Check if it's a directory
|
|
871
|
+
elif path_obj.is_dir():
|
|
872
|
+
# First, try to read as a partitioned parquet dataset; This handles datasets where
|
|
873
|
+
# Parquet files are in subdirectories with partition columns encoded in paths
|
|
874
|
+
try:
|
|
875
|
+
# Both Polars and Pandas can handle partitioned datasets natively
|
|
876
|
+
if _is_lib_present(lib_name="polars"):
|
|
877
|
+
import polars as pl
|
|
878
|
+
|
|
879
|
+
# Try reading as partitioned dataset first
|
|
880
|
+
df = pl.read_parquet(str(path_obj))
|
|
881
|
+
return df
|
|
882
|
+
elif _is_lib_present(lib_name="pandas"):
|
|
883
|
+
import pandas as pd
|
|
884
|
+
|
|
885
|
+
# Try reading as partitioned dataset first
|
|
886
|
+
df = pd.read_parquet(str(path_obj))
|
|
887
|
+
return df
|
|
888
|
+
except Exception:
|
|
889
|
+
# If partitioned read fails, fall back to simple directory scan
|
|
890
|
+
pass
|
|
891
|
+
|
|
892
|
+
# Fallback: Look for .parquet files directly in the directory
|
|
893
|
+
parquet_files = list(path_obj.glob("*.parquet"))
|
|
894
|
+
if parquet_files:
|
|
895
|
+
parquet_paths = sorted(parquet_files)
|
|
896
|
+
else:
|
|
897
|
+
raise FileNotFoundError(
|
|
898
|
+
f"No .parquet files found in directory: {path_obj}. "
|
|
899
|
+
f"This could be a non-partitioned directory without .parquet files, "
|
|
900
|
+
f"or a partitioned dataset that couldn't be read."
|
|
901
|
+
)
|
|
902
|
+
|
|
903
|
+
# If it's not a parquet file, directory, or glob pattern, return original data
|
|
904
|
+
else:
|
|
905
|
+
return data
|
|
906
|
+
|
|
907
|
+
# Handle list/sequence of paths
|
|
908
|
+
elif isinstance(data, (list, tuple)):
|
|
909
|
+
for item in data:
|
|
910
|
+
item_path = Path(item)
|
|
911
|
+
if item_path.suffix.lower() == ".parquet":
|
|
912
|
+
if item_path.exists():
|
|
913
|
+
parquet_paths.append(item_path)
|
|
914
|
+
else:
|
|
915
|
+
raise FileNotFoundError(f"Parquet file not found: {item_path}")
|
|
916
|
+
else:
|
|
917
|
+
# If any item is not a parquet file, return original data
|
|
918
|
+
return data
|
|
919
|
+
|
|
920
|
+
# If no parquet files found, return original data
|
|
921
|
+
if not parquet_paths:
|
|
922
|
+
return data
|
|
923
|
+
|
|
924
|
+
# Read the parquet file(s) using available libraries; prefer Polars, fallback to Pandas
|
|
925
|
+
if _is_lib_present(lib_name="polars"):
|
|
926
|
+
try:
|
|
927
|
+
import polars as pl
|
|
928
|
+
|
|
929
|
+
if len(parquet_paths) == 1:
|
|
930
|
+
# Single file
|
|
931
|
+
return pl.read_parquet(parquet_paths[0])
|
|
932
|
+
else:
|
|
933
|
+
# Multiple files: concatenate them
|
|
934
|
+
dfs = [pl.read_parquet(path) for path in parquet_paths]
|
|
935
|
+
return pl.concat(dfs, how="vertical_relaxed")
|
|
936
|
+
except Exception as e:
|
|
937
|
+
# If Polars fails, try Pandas if available
|
|
938
|
+
if _is_lib_present(lib_name="pandas"):
|
|
939
|
+
import pandas as pd
|
|
940
|
+
|
|
941
|
+
if len(parquet_paths) == 1:
|
|
942
|
+
return pd.read_parquet(parquet_paths[0])
|
|
943
|
+
else:
|
|
944
|
+
# Multiple files: concatenate them
|
|
945
|
+
dfs = [pd.read_parquet(path) for path in parquet_paths]
|
|
946
|
+
return pd.concat(dfs, ignore_index=True)
|
|
947
|
+
else:
|
|
948
|
+
raise RuntimeError(
|
|
949
|
+
f"Failed to read Parquet file(s) with Polars: {e}. "
|
|
950
|
+
"Pandas is not available as fallback."
|
|
951
|
+
) from e
|
|
952
|
+
elif _is_lib_present(lib_name="pandas"):
|
|
953
|
+
try:
|
|
954
|
+
import pandas as pd
|
|
955
|
+
|
|
956
|
+
if len(parquet_paths) == 1:
|
|
957
|
+
return pd.read_parquet(parquet_paths[0])
|
|
958
|
+
else:
|
|
959
|
+
# Multiple files: concatenate them
|
|
960
|
+
dfs = [pd.read_parquet(path) for path in parquet_paths]
|
|
961
|
+
return pd.concat(dfs, ignore_index=True)
|
|
962
|
+
except Exception as e:
|
|
963
|
+
raise RuntimeError(f"Failed to read Parquet file(s) with Pandas: {e}") from e
|
|
964
|
+
else:
|
|
965
|
+
raise ImportError(
|
|
966
|
+
"Neither Polars nor Pandas is available for reading Parquet files. "
|
|
967
|
+
"Please install either 'polars' or 'pandas' to use Parquet file inputs."
|
|
968
|
+
)
|
|
969
|
+
|
|
970
|
+
|
|
563
971
|
def preview(
|
|
564
972
|
data: FrameT | Any,
|
|
565
973
|
columns_subset: str | list[str] | Column | None = None,
|
|
@@ -590,8 +998,14 @@ def preview(
|
|
|
590
998
|
Parameters
|
|
591
999
|
----------
|
|
592
1000
|
data
|
|
593
|
-
The table to preview, which could be a DataFrame object
|
|
594
|
-
|
|
1001
|
+
The table to preview, which could be a DataFrame object, an Ibis table object, a CSV
|
|
1002
|
+
file path, a Parquet file path, or a database connection string. When providing a CSV or
|
|
1003
|
+
Parquet file path (as a string or `pathlib.Path` object), the file will be automatically
|
|
1004
|
+
loaded using an available DataFrame library (Polars or Pandas). Parquet input also supports
|
|
1005
|
+
glob patterns, directories containing .parquet files, and Spark-style partitioned datasets.
|
|
1006
|
+
Connection strings enable direct database access via Ibis with optional table specification
|
|
1007
|
+
using the `::table_name` suffix. Read the *Supported Input Table Types* section for details
|
|
1008
|
+
on the supported table types.
|
|
595
1009
|
columns_subset
|
|
596
1010
|
The columns to display in the table, by default `None` (all columns are shown). This can
|
|
597
1011
|
be a string, a list of strings, a `Column` object, or a `ColumnSelector` object. The latter
|
|
@@ -642,12 +1056,34 @@ def preview(
|
|
|
642
1056
|
- PySpark table (`"pyspark"`)*
|
|
643
1057
|
- BigQuery table (`"bigquery"`)*
|
|
644
1058
|
- Parquet table (`"parquet"`)*
|
|
1059
|
+
- CSV files (string path or `pathlib.Path` object with `.csv` extension)
|
|
1060
|
+
- Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
|
|
1061
|
+
extension, or partitioned dataset)
|
|
1062
|
+
- Database connection strings (URI format with optional table specification)
|
|
645
1063
|
|
|
646
1064
|
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
|
|
647
1065
|
`ibis.expr.types.relations.Table`). Furthermore, using `preview()` with these types of tables
|
|
648
1066
|
requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a Polars or
|
|
649
1067
|
Pandas DataFrame, the availability of Ibis is not needed.
|
|
650
1068
|
|
|
1069
|
+
To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
|
|
1070
|
+
provided. The file will be automatically detected and loaded using the best available DataFrame
|
|
1071
|
+
library. The loading preference is Polars first, then Pandas as a fallback.
|
|
1072
|
+
|
|
1073
|
+
Connection strings follow database URL formats and must also specify a table using the
|
|
1074
|
+
`::table_name` suffix. Examples include:
|
|
1075
|
+
|
|
1076
|
+
```
|
|
1077
|
+
"duckdb:///path/to/database.ddb::table_name"
|
|
1078
|
+
"sqlite:///path/to/database.db::table_name"
|
|
1079
|
+
"postgresql://user:password@localhost:5432/database::table_name"
|
|
1080
|
+
"mysql://user:password@localhost:3306/database::table_name"
|
|
1081
|
+
"bigquery://project/dataset::table_name"
|
|
1082
|
+
"snowflake://user:password@account/database/schema::table_name"
|
|
1083
|
+
```
|
|
1084
|
+
|
|
1085
|
+
When using connection strings, the Ibis library with the appropriate backend driver is required.
|
|
1086
|
+
|
|
651
1087
|
Examples
|
|
652
1088
|
--------
|
|
653
1089
|
It's easy to preview a table using the `preview()` function. Here's an example using the
|
|
@@ -714,8 +1150,80 @@ def preview(
|
|
|
714
1150
|
columns_subset=pb.col(pb.starts_with("item") | pb.matches("player"))
|
|
715
1151
|
)
|
|
716
1152
|
```
|
|
1153
|
+
|
|
1154
|
+
### Working with CSV Files
|
|
1155
|
+
|
|
1156
|
+
The `preview()` function can directly accept CSV file paths, making it easy to preview data
|
|
1157
|
+
stored in CSV files without manual loading:
|
|
1158
|
+
|
|
1159
|
+
```{python}
|
|
1160
|
+
# Get a path to a CSV file from the package data
|
|
1161
|
+
csv_path = pb.get_data_path("global_sales", "csv")
|
|
1162
|
+
|
|
1163
|
+
pb.preview(csv_path)
|
|
1164
|
+
```
|
|
1165
|
+
|
|
1166
|
+
You can also use a Path object to specify the CSV file:
|
|
1167
|
+
|
|
1168
|
+
```{python}
|
|
1169
|
+
from pathlib import Path
|
|
1170
|
+
|
|
1171
|
+
csv_file = Path(pb.get_data_path("game_revenue", "csv"))
|
|
1172
|
+
|
|
1173
|
+
pb.preview(csv_file, n_head=3, n_tail=3)
|
|
1174
|
+
```
|
|
1175
|
+
|
|
1176
|
+
### Working with Parquet Files
|
|
1177
|
+
|
|
1178
|
+
The `preview()` function can directly accept Parquet files and datasets in various formats:
|
|
1179
|
+
|
|
1180
|
+
```{python}
|
|
1181
|
+
# Single Parquet file from package data
|
|
1182
|
+
parquet_path = pb.get_data_path("nycflights", "parquet")
|
|
1183
|
+
|
|
1184
|
+
pb.preview(parquet_path)
|
|
1185
|
+
```
|
|
1186
|
+
|
|
1187
|
+
You can also use glob patterns and directories:
|
|
1188
|
+
|
|
1189
|
+
```python
|
|
1190
|
+
# Multiple Parquet files with glob patterns
|
|
1191
|
+
pb.preview("data/sales_*.parquet")
|
|
1192
|
+
|
|
1193
|
+
# Directory containing Parquet files
|
|
1194
|
+
pb.preview("parquet_data/")
|
|
1195
|
+
|
|
1196
|
+
# Partitioned Parquet dataset
|
|
1197
|
+
pb.preview("sales_data/") # Auto-discovers partition columns
|
|
1198
|
+
```
|
|
1199
|
+
|
|
1200
|
+
### Working with Database Connection Strings
|
|
1201
|
+
|
|
1202
|
+
The `preview()` function supports database connection strings for direct preview of database
|
|
1203
|
+
tables. Connection strings must specify a table using the `::table_name` suffix:
|
|
1204
|
+
|
|
1205
|
+
```{python}
|
|
1206
|
+
# Get path to a DuckDB database file from package data
|
|
1207
|
+
duckdb_path = pb.get_data_path("game_revenue", "duckdb")
|
|
1208
|
+
|
|
1209
|
+
pb.preview(f"duckdb:///{duckdb_path}::game_revenue")
|
|
1210
|
+
```
|
|
1211
|
+
|
|
1212
|
+
For comprehensive documentation on supported connection string formats, error handling, and
|
|
1213
|
+
installation requirements, see the [`connect_to_table()`](`pointblank.connect_to_table`)
|
|
1214
|
+
function.
|
|
717
1215
|
"""
|
|
718
1216
|
|
|
1217
|
+
# Process input data to handle different data source types
|
|
1218
|
+
# Handle connection string input (e.g., "duckdb:///path/to/file.ddb::table_name")
|
|
1219
|
+
data = _process_connection_string(data)
|
|
1220
|
+
|
|
1221
|
+
# Handle CSV file input (e.g., "data.csv" or Path("data.csv"))
|
|
1222
|
+
data = _process_csv_input(data)
|
|
1223
|
+
|
|
1224
|
+
# Handle Parquet file input (e.g., "data.parquet", "data/*.parquet", "data/")
|
|
1225
|
+
data = _process_parquet_input(data)
|
|
1226
|
+
|
|
719
1227
|
if incl_header is None:
|
|
720
1228
|
incl_header = global_config.preview_incl_header
|
|
721
1229
|
|
|
@@ -913,7 +1421,7 @@ def _generate_display_table(
|
|
|
913
1421
|
k: v.split("(")[0] if "(" in v else v for k, v in col_dtype_dict.items()
|
|
914
1422
|
}
|
|
915
1423
|
|
|
916
|
-
# Create a dictionary of column and row positions where the value is None/NA/
|
|
1424
|
+
# Create a dictionary of column and row positions where the value is None/NA/Null
|
|
917
1425
|
# This is used to highlight these values in the table
|
|
918
1426
|
if df_lib_name_gt == "polars":
|
|
919
1427
|
none_values = {k: data[k].is_null().to_list() for k in col_names}
|
|
@@ -937,7 +1445,10 @@ def _generate_display_table(
|
|
|
937
1445
|
column_values = gt.gt._get_column_of_values(built_gt, column_name=column, context="html")
|
|
938
1446
|
|
|
939
1447
|
# Get the maximum number of characters in the column
|
|
940
|
-
|
|
1448
|
+
if column_values: # Check if column_values is not empty
|
|
1449
|
+
max_length_col_vals.append(max([len(str(val)) for val in column_values]))
|
|
1450
|
+
else:
|
|
1451
|
+
max_length_col_vals.append(0) # Use 0 for empty columns
|
|
941
1452
|
|
|
942
1453
|
length_col_names = [len(column) for column in col_dtype_dict.keys()]
|
|
943
1454
|
length_data_types = [len(dtype) for dtype in col_dtype_dict_short.values()]
|
|
@@ -1008,8 +1519,12 @@ def _generate_display_table(
|
|
|
1008
1519
|
|
|
1009
1520
|
# Get the highest number in the `row_number_list` and calculate a width that will
|
|
1010
1521
|
# safely fit a number of that magnitude
|
|
1011
|
-
|
|
1012
|
-
|
|
1522
|
+
if row_number_list: # Check if list is not empty
|
|
1523
|
+
max_row_num = max(row_number_list)
|
|
1524
|
+
max_row_num_width = len(str(max_row_num)) * 7.8 + 10
|
|
1525
|
+
else:
|
|
1526
|
+
# If row_number_list is empty, use a default width
|
|
1527
|
+
max_row_num_width = 7.8 * 2 + 10 # Width for 2-digit numbers
|
|
1013
1528
|
|
|
1014
1529
|
# Update the col_width_dict to include the row number column
|
|
1015
1530
|
col_width_dict = {"_row_num_": f"{max_row_num_width}px"} | col_width_dict
|
|
@@ -1722,6 +2237,9 @@ def get_column_count(data: FrameT | Any) -> int:
|
|
|
1722
2237
|
elif "pandas" in str(type(data)):
|
|
1723
2238
|
return data.shape[1]
|
|
1724
2239
|
|
|
2240
|
+
elif "narwhals" in str(type(data)):
|
|
2241
|
+
return len(data.columns)
|
|
2242
|
+
|
|
1725
2243
|
else:
|
|
1726
2244
|
raise ValueError("The input table type supplied in `data=` is not supported.")
|
|
1727
2245
|
|
|
@@ -1815,6 +2333,9 @@ def get_row_count(data: FrameT | Any) -> int:
|
|
|
1815
2333
|
elif "pandas" in str(type(data)):
|
|
1816
2334
|
return data.shape[0]
|
|
1817
2335
|
|
|
2336
|
+
elif "narwhals" in str(type(data)):
|
|
2337
|
+
return data.shape[0]
|
|
2338
|
+
|
|
1818
2339
|
else:
|
|
1819
2340
|
raise ValueError("The input table type supplied in `data=` is not supported.")
|
|
1820
2341
|
|
|
@@ -1930,6 +2451,239 @@ class _ValidationInfo:
|
|
|
1930
2451
|
return self.val_info
|
|
1931
2452
|
|
|
1932
2453
|
|
|
2454
|
+
def connect_to_table(connection_string: str) -> Any:
|
|
2455
|
+
"""
|
|
2456
|
+
Connect to a database table using a connection string.
|
|
2457
|
+
|
|
2458
|
+
This utility function tests whether a connection string leads to a valid table and returns
|
|
2459
|
+
the table object if successful. It provides helpful error messages when no table is specified
|
|
2460
|
+
or when backend dependencies are missing.
|
|
2461
|
+
|
|
2462
|
+
Parameters
|
|
2463
|
+
----------
|
|
2464
|
+
connection_string
|
|
2465
|
+
A database connection string with a required table specification using the `::table_name`
|
|
2466
|
+
suffix. Supported formats are outlined in the *Supported Connection String Formats* section.
|
|
2467
|
+
|
|
2468
|
+
Returns
|
|
2469
|
+
-------
|
|
2470
|
+
Any
|
|
2471
|
+
An Ibis table object for the specified database table.
|
|
2472
|
+
|
|
2473
|
+
Supported Connection String Formats
|
|
2474
|
+
-----------------------------------
|
|
2475
|
+
The `connection_string` parameter must include a valid connection string with a table name
|
|
2476
|
+
specified using the `::` syntax. Here are some examples on how to format connection strings
|
|
2477
|
+
for various backends:
|
|
2478
|
+
|
|
2479
|
+
```
|
|
2480
|
+
DuckDB: "duckdb:///path/to/database.ddb::table_name"
|
|
2481
|
+
SQLite: "sqlite:///path/to/database.db::table_name"
|
|
2482
|
+
PostgreSQL: "postgresql://user:password@localhost:5432/database::table_name"
|
|
2483
|
+
MySQL: "mysql://user:password@localhost:3306/database::table_name"
|
|
2484
|
+
BigQuery: "bigquery://project/dataset::table_name"
|
|
2485
|
+
Snowflake: "snowflake://user:password@account/database/schema::table_name"
|
|
2486
|
+
```
|
|
2487
|
+
|
|
2488
|
+
If the connection string does not include a table name, the function will attempt to connect to
|
|
2489
|
+
the database and list available tables, providing guidance on how to specify a table.
|
|
2490
|
+
|
|
2491
|
+
Examples
|
|
2492
|
+
--------
|
|
2493
|
+
Connect to a DuckDB table:
|
|
2494
|
+
|
|
2495
|
+
```{python}
|
|
2496
|
+
import pointblank as pb
|
|
2497
|
+
|
|
2498
|
+
# Get path to a DuckDB database file from package data
|
|
2499
|
+
duckdb_path = pb.get_data_path("game_revenue", "duckdb")
|
|
2500
|
+
|
|
2501
|
+
# Connect to the `game_revenue` table in the DuckDB database
|
|
2502
|
+
game_revenue = pb.connect_to_table(f"duckdb:///{duckdb_path}::game_revenue")
|
|
2503
|
+
|
|
2504
|
+
# Use with the `preview()` function
|
|
2505
|
+
pb.preview(game_revenue)
|
|
2506
|
+
```
|
|
2507
|
+
|
|
2508
|
+
Here are some backend-specific connection examples:
|
|
2509
|
+
|
|
2510
|
+
```python
|
|
2511
|
+
# PostgreSQL
|
|
2512
|
+
pg_table = pb.connect_to_table(
|
|
2513
|
+
"postgresql://user:password@localhost:5432/warehouse::customer_data"
|
|
2514
|
+
)
|
|
2515
|
+
|
|
2516
|
+
# SQLite
|
|
2517
|
+
sqlite_table = pb.connect_to_table("sqlite:///local_data.db::products")
|
|
2518
|
+
|
|
2519
|
+
# BigQuery
|
|
2520
|
+
bq_table = pb.connect_to_table("bigquery://my-project/analytics::daily_metrics")
|
|
2521
|
+
```
|
|
2522
|
+
|
|
2523
|
+
This function requires the Ibis library with appropriate backend drivers:
|
|
2524
|
+
|
|
2525
|
+
```bash
|
|
2526
|
+
# You can install a set of common backends:
|
|
2527
|
+
pip install 'ibis-framework[duckdb,postgres,mysql,sqlite]'
|
|
2528
|
+
|
|
2529
|
+
# ...or specific backends as needed:
|
|
2530
|
+
pip install 'ibis-framework[duckdb]' # for DuckDB
|
|
2531
|
+
pip install 'ibis-framework[postgres]' # for PostgreSQL
|
|
2532
|
+
```
|
|
2533
|
+
"""
|
|
2534
|
+
# Check if Ibis is available
|
|
2535
|
+
if not _is_lib_present(lib_name="ibis"):
|
|
2536
|
+
raise ImportError(
|
|
2537
|
+
"The Ibis library is not installed but is required for database connection strings.\n"
|
|
2538
|
+
"Install it with: pip install 'ibis-framework[duckdb]' (or other backend as needed)"
|
|
2539
|
+
)
|
|
2540
|
+
|
|
2541
|
+
import ibis
|
|
2542
|
+
|
|
2543
|
+
# Check if connection string includes table specification
|
|
2544
|
+
if "::" not in connection_string:
|
|
2545
|
+
# Try to connect to get available tables for helpful error message
|
|
2546
|
+
try:
|
|
2547
|
+
# Extract the base connection string (without table name)
|
|
2548
|
+
base_connection = connection_string
|
|
2549
|
+
|
|
2550
|
+
# Connect to the database
|
|
2551
|
+
conn = ibis.connect(base_connection)
|
|
2552
|
+
|
|
2553
|
+
# Get list of available tables
|
|
2554
|
+
try:
|
|
2555
|
+
available_tables = conn.list_tables()
|
|
2556
|
+
except Exception:
|
|
2557
|
+
available_tables = []
|
|
2558
|
+
|
|
2559
|
+
conn.disconnect()
|
|
2560
|
+
|
|
2561
|
+
# Create helpful error message
|
|
2562
|
+
if available_tables:
|
|
2563
|
+
table_list = "\n".join(f" - {table}" for table in available_tables)
|
|
2564
|
+
error_msg = (
|
|
2565
|
+
f"No table specified in connection string: {connection_string}\n\n"
|
|
2566
|
+
f"Available tables in the database:\n{table_list}\n\n"
|
|
2567
|
+
f"To access a specific table, use the format:\n"
|
|
2568
|
+
f" {connection_string}::TABLE_NAME\n\n"
|
|
2569
|
+
f"Examples:\n"
|
|
2570
|
+
)
|
|
2571
|
+
# Add examples with first few table names
|
|
2572
|
+
for table in available_tables[:3]:
|
|
2573
|
+
error_msg += f" {connection_string}::{table}\n"
|
|
2574
|
+
else:
|
|
2575
|
+
error_msg = (
|
|
2576
|
+
f"No table specified in connection string: {connection_string}\n\n"
|
|
2577
|
+
f"No tables found in the database or unable to list tables.\n\n"
|
|
2578
|
+
f"To access a specific table, use the format:\n"
|
|
2579
|
+
f" {connection_string}::TABLE_NAME"
|
|
2580
|
+
)
|
|
2581
|
+
|
|
2582
|
+
raise ValueError(error_msg)
|
|
2583
|
+
|
|
2584
|
+
except Exception as e:
|
|
2585
|
+
if isinstance(e, ValueError):
|
|
2586
|
+
raise # Re-raise our custom ValueError
|
|
2587
|
+
|
|
2588
|
+
# Check for backend-specific errors and provide installation guidance
|
|
2589
|
+
error_str = str(e).lower()
|
|
2590
|
+
backend_install_map = {
|
|
2591
|
+
"duckdb": "pip install 'ibis-framework[duckdb]'",
|
|
2592
|
+
"postgresql": "pip install 'ibis-framework[postgres]'",
|
|
2593
|
+
"postgres": "pip install 'ibis-framework[postgres]'",
|
|
2594
|
+
"mysql": "pip install 'ibis-framework[mysql]'",
|
|
2595
|
+
"sqlite": "pip install 'ibis-framework[sqlite]'",
|
|
2596
|
+
"bigquery": "pip install 'ibis-framework[bigquery]'",
|
|
2597
|
+
"snowflake": "pip install 'ibis-framework[snowflake]'",
|
|
2598
|
+
}
|
|
2599
|
+
|
|
2600
|
+
# Check if this is a missing backend dependency
|
|
2601
|
+
for backend, install_cmd in backend_install_map.items():
|
|
2602
|
+
if backend in error_str and ("not found" in error_str or "no module" in error_str):
|
|
2603
|
+
raise ConnectionError(
|
|
2604
|
+
f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
|
|
2605
|
+
f" {install_cmd}\n\n"
|
|
2606
|
+
f"Original error: {e}\n\n"
|
|
2607
|
+
f"Supported connection string formats:\n"
|
|
2608
|
+
f"- DuckDB: 'duckdb:///path/to/file.ddb::table_name'\n"
|
|
2609
|
+
f"- SQLite: 'sqlite:///path/to/file.db::table_name'\n"
|
|
2610
|
+
f"- PostgreSQL: 'postgresql://user:pass@host:port/db::table_name'\n"
|
|
2611
|
+
f"- MySQL: 'mysql://user:pass@host:port/db::table_name'\n"
|
|
2612
|
+
f"- BigQuery: 'bigquery://project/dataset::table_name'\n"
|
|
2613
|
+
f"- Snowflake: 'snowflake://user:pass@account/db/schema::table_name'\n"
|
|
2614
|
+
f"\nNote: Use '::table_name' to specify the table within the database."
|
|
2615
|
+
) from e
|
|
2616
|
+
|
|
2617
|
+
# Generic connection error
|
|
2618
|
+
raise ConnectionError(
|
|
2619
|
+
f"Failed to connect to database using connection string: {connection_string}\n"
|
|
2620
|
+
f"Error: {e}\n\n"
|
|
2621
|
+
f"No table specified. Use the format: {connection_string}::TABLE_NAME"
|
|
2622
|
+
) from e
|
|
2623
|
+
|
|
2624
|
+
# Split connection string and table name
|
|
2625
|
+
try:
|
|
2626
|
+
base_connection, table_name = connection_string.rsplit("::", 1)
|
|
2627
|
+
except ValueError:
|
|
2628
|
+
raise ValueError(f"Invalid connection string format: {connection_string}")
|
|
2629
|
+
|
|
2630
|
+
# Connect to database and get table
|
|
2631
|
+
try:
|
|
2632
|
+
conn = ibis.connect(base_connection)
|
|
2633
|
+
table = conn.table(table_name)
|
|
2634
|
+
return table
|
|
2635
|
+
|
|
2636
|
+
except Exception as e:
|
|
2637
|
+
# Check for backend-specific errors and provide installation guidance
|
|
2638
|
+
error_str = str(e).lower()
|
|
2639
|
+
backend_install_map = {
|
|
2640
|
+
"duckdb": "pip install 'ibis-framework[duckdb]'",
|
|
2641
|
+
"postgresql": "pip install 'ibis-framework[postgres]'",
|
|
2642
|
+
"postgres": "pip install 'ibis-framework[postgres]'",
|
|
2643
|
+
"mysql": "pip install 'ibis-framework[mysql]'",
|
|
2644
|
+
"sqlite": "pip install 'ibis-framework[sqlite]'",
|
|
2645
|
+
"bigquery": "pip install 'ibis-framework[bigquery]'",
|
|
2646
|
+
"snowflake": "pip install 'ibis-framework[snowflake]'",
|
|
2647
|
+
}
|
|
2648
|
+
|
|
2649
|
+
# Check if this is a missing backend dependency
|
|
2650
|
+
for backend, install_cmd in backend_install_map.items():
|
|
2651
|
+
if backend in error_str and ("not found" in error_str or "no module" in error_str):
|
|
2652
|
+
raise ConnectionError(
|
|
2653
|
+
f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
|
|
2654
|
+
f" {install_cmd}\n\n"
|
|
2655
|
+
f"Original error: {e}"
|
|
2656
|
+
) from e
|
|
2657
|
+
|
|
2658
|
+
# Check if table doesn't exist
|
|
2659
|
+
if "table" in error_str and ("not found" in error_str or "does not exist" in error_str):
|
|
2660
|
+
# Try to get available tables for helpful message
|
|
2661
|
+
try:
|
|
2662
|
+
available_tables = conn.list_tables()
|
|
2663
|
+
if available_tables:
|
|
2664
|
+
table_list = "\n".join(f" - {table}" for table in available_tables)
|
|
2665
|
+
raise ValueError(
|
|
2666
|
+
f"Table '{table_name}' not found in database.\n\n"
|
|
2667
|
+
f"Available tables:\n{table_list}\n\n"
|
|
2668
|
+
f"Check the table name and try again with:\n"
|
|
2669
|
+
f" {base_connection}::CORRECT_TABLE_NAME"
|
|
2670
|
+
) from e
|
|
2671
|
+
else:
|
|
2672
|
+
raise ValueError(
|
|
2673
|
+
f"Table '{table_name}' not found and no tables available in database."
|
|
2674
|
+
) from e
|
|
2675
|
+
except Exception:
|
|
2676
|
+
raise ValueError(
|
|
2677
|
+
f"Table '{table_name}' not found in database. "
|
|
2678
|
+
f"Check the table name and connection string."
|
|
2679
|
+
) from e
|
|
2680
|
+
|
|
2681
|
+
# Generic connection error
|
|
2682
|
+
raise ConnectionError(
|
|
2683
|
+
f"Failed to connect to table '{table_name}' using: {base_connection}\nError: {e}"
|
|
2684
|
+
) from e
|
|
2685
|
+
|
|
2686
|
+
|
|
1933
2687
|
@dataclass
|
|
1934
2688
|
class Validate:
|
|
1935
2689
|
"""
|
|
@@ -1962,8 +2716,14 @@ class Validate:
|
|
|
1962
2716
|
Parameters
|
|
1963
2717
|
----------
|
|
1964
2718
|
data
|
|
1965
|
-
The table to validate, which could be a DataFrame object
|
|
1966
|
-
|
|
2719
|
+
The table to validate, which could be a DataFrame object, an Ibis table object, a CSV
|
|
2720
|
+
file path, a Parquet file path, or a database connection string. When providing a CSV or
|
|
2721
|
+
Parquet file path (as a string or `pathlib.Path` object), the file will be automatically
|
|
2722
|
+
loaded using an available DataFrame library (Polars or Pandas). Parquet input also supports
|
|
2723
|
+
glob patterns, directories containing .parquet files, and Spark-style partitioned datasets.
|
|
2724
|
+
Connection strings enable direct database access via Ibis with optional table specification
|
|
2725
|
+
using the `::table_name` suffix. Read the *Supported Input Table Types* section for details
|
|
2726
|
+
on the supported table types.
|
|
1967
2727
|
tbl_name
|
|
1968
2728
|
An optional name to assign to the input table object. If no value is provided, a name will
|
|
1969
2729
|
be generated based on whatever information is available. This table name will be displayed
|
|
@@ -2033,12 +2793,34 @@ class Validate:
|
|
|
2033
2793
|
- PySpark table (`"pyspark"`)*
|
|
2034
2794
|
- BigQuery table (`"bigquery"`)*
|
|
2035
2795
|
- Parquet table (`"parquet"`)*
|
|
2796
|
+
- CSV files (string path or `pathlib.Path` object with `.csv` extension)
|
|
2797
|
+
- Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
|
|
2798
|
+
extension, or partitioned dataset)
|
|
2799
|
+
- Database connection strings (URI format with optional table specification)
|
|
2036
2800
|
|
|
2037
2801
|
The table types marked with an asterisk need to be prepared as Ibis tables (with type of
|
|
2038
2802
|
`ibis.expr.types.relations.Table`). Furthermore, the use of `Validate` with such tables requires
|
|
2039
2803
|
the Ibis library v9.5.0 and above to be installed. If the input table is a Polars or Pandas
|
|
2040
2804
|
DataFrame, the Ibis library is not required.
|
|
2041
2805
|
|
|
2806
|
+
To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
|
|
2807
|
+
provided. The file will be automatically detected and loaded using the best available DataFrame
|
|
2808
|
+
library. The loading preference is Polars first, then Pandas as a fallback.
|
|
2809
|
+
|
|
2810
|
+
Connection strings follow database URL formats and must also specify a table using the
|
|
2811
|
+
`::table_name` suffix. Examples include:
|
|
2812
|
+
|
|
2813
|
+
```
|
|
2814
|
+
"duckdb:///path/to/database.ddb::table_name"
|
|
2815
|
+
"sqlite:///path/to/database.db::table_name"
|
|
2816
|
+
"postgresql://user:password@localhost:5432/database::table_name"
|
|
2817
|
+
"mysql://user:password@localhost:3306/database::table_name"
|
|
2818
|
+
"bigquery://project/dataset::table_name"
|
|
2819
|
+
"snowflake://user:password@account/database/schema::table_name"
|
|
2820
|
+
```
|
|
2821
|
+
|
|
2822
|
+
When using connection strings, the Ibis library with the appropriate backend driver is required.
|
|
2823
|
+
|
|
2042
2824
|
Thresholds
|
|
2043
2825
|
----------
|
|
2044
2826
|
The `thresholds=` parameter is used to set the failure-condition levels for all validation
|
|
@@ -2195,8 +2977,8 @@ class Validate:
|
|
|
2195
2977
|
```{python}
|
|
2196
2978
|
import pointblank as pb
|
|
2197
2979
|
|
|
2198
|
-
# Load the small_table dataset
|
|
2199
|
-
small_table = pb.load_dataset()
|
|
2980
|
+
# Load the `small_table` dataset
|
|
2981
|
+
small_table = pb.load_dataset(dataset="small_table", tbl_type="polars")
|
|
2200
2982
|
|
|
2201
2983
|
# Preview the table
|
|
2202
2984
|
pb.preview(small_table)
|
|
@@ -2262,7 +3044,7 @@ class Validate:
|
|
|
2262
3044
|
brief). Here's an example of a global setting for briefs:
|
|
2263
3045
|
|
|
2264
3046
|
```{python}
|
|
2265
|
-
|
|
3047
|
+
validation_2 = (
|
|
2266
3048
|
pb.Validate(
|
|
2267
3049
|
data=pb.load_dataset(),
|
|
2268
3050
|
tbl_name="small_table",
|
|
@@ -2279,7 +3061,7 @@ class Validate:
|
|
|
2279
3061
|
.interrogate()
|
|
2280
3062
|
)
|
|
2281
3063
|
|
|
2282
|
-
|
|
3064
|
+
validation_2
|
|
2283
3065
|
```
|
|
2284
3066
|
|
|
2285
3067
|
We see the text of the briefs appear in the `STEP` column of the reporting table. Furthermore,
|
|
@@ -2297,7 +3079,7 @@ class Validate:
|
|
|
2297
3079
|
the data extracts for each validation step.
|
|
2298
3080
|
|
|
2299
3081
|
```{python}
|
|
2300
|
-
|
|
3082
|
+
validation_2.get_data_extracts()
|
|
2301
3083
|
```
|
|
2302
3084
|
|
|
2303
3085
|
We can also view step reports for each validation step using the
|
|
@@ -2305,7 +3087,7 @@ class Validate:
|
|
|
2305
3087
|
type of validation step and shows the relevant information for a step's validation.
|
|
2306
3088
|
|
|
2307
3089
|
```{python}
|
|
2308
|
-
|
|
3090
|
+
validation_2.get_step_report(i=2)
|
|
2309
3091
|
```
|
|
2310
3092
|
|
|
2311
3093
|
The `Validate` class also has a method for getting the sundered data, which is the data that
|
|
@@ -2313,11 +3095,141 @@ class Validate:
|
|
|
2313
3095
|
[`get_sundered_data()`](`pointblank.Validate.get_sundered_data`) method.
|
|
2314
3096
|
|
|
2315
3097
|
```{python}
|
|
2316
|
-
pb.preview(
|
|
3098
|
+
pb.preview(validation_2.get_sundered_data())
|
|
2317
3099
|
```
|
|
2318
3100
|
|
|
2319
3101
|
The sundered data is a DataFrame that contains the rows that passed or failed the validation.
|
|
2320
3102
|
The default behavior is to return the rows that failed the validation, as shown above.
|
|
3103
|
+
|
|
3104
|
+
### Working with CSV Files
|
|
3105
|
+
|
|
3106
|
+
The `Validate` class can directly accept CSV file paths, making it easy to validate data stored
|
|
3107
|
+
in CSV files without manual loading:
|
|
3108
|
+
|
|
3109
|
+
```{python}
|
|
3110
|
+
# Get a path to a CSV file from the package data
|
|
3111
|
+
csv_path = pb.get_data_path("global_sales", "csv")
|
|
3112
|
+
|
|
3113
|
+
validation_3 = (
|
|
3114
|
+
pb.Validate(
|
|
3115
|
+
data=csv_path,
|
|
3116
|
+
label="CSV validation example"
|
|
3117
|
+
)
|
|
3118
|
+
.col_exists(["customer_id", "product_id", "revenue"])
|
|
3119
|
+
.col_vals_not_null(["customer_id", "product_id"])
|
|
3120
|
+
.col_vals_gt(columns="revenue", value=0)
|
|
3121
|
+
.interrogate()
|
|
3122
|
+
)
|
|
3123
|
+
|
|
3124
|
+
validation_3
|
|
3125
|
+
```
|
|
3126
|
+
|
|
3127
|
+
You can also use a Path object to specify the CSV file. Here's an example of how to do that:
|
|
3128
|
+
|
|
3129
|
+
```{python}
|
|
3130
|
+
from pathlib import Path
|
|
3131
|
+
|
|
3132
|
+
csv_file = Path(pb.get_data_path("game_revenue", "csv"))
|
|
3133
|
+
|
|
3134
|
+
validation_4 = (
|
|
3135
|
+
pb.Validate(data=csv_file, label="Game Revenue Validation")
|
|
3136
|
+
.col_exists(["player_id", "session_id", "item_name"])
|
|
3137
|
+
.col_vals_regex(
|
|
3138
|
+
columns="session_id",
|
|
3139
|
+
pattern=r"[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}"
|
|
3140
|
+
)
|
|
3141
|
+
.col_vals_gt(columns="item_revenue", value=0, na_pass=True)
|
|
3142
|
+
.interrogate()
|
|
3143
|
+
)
|
|
3144
|
+
|
|
3145
|
+
validation_4
|
|
3146
|
+
```
|
|
3147
|
+
|
|
3148
|
+
The CSV loading is automatic, so when a string or Path with a `.csv` extension is provided,
|
|
3149
|
+
Pointblank will automatically load the file using the best available DataFrame library (Polars
|
|
3150
|
+
preferred, Pandas as fallback). The loaded data can then be used with all validation methods
|
|
3151
|
+
just like any other supported table type.
|
|
3152
|
+
|
|
3153
|
+
### Working with Parquet Files
|
|
3154
|
+
|
|
3155
|
+
The `Validate` class can directly accept Parquet files and datasets in various formats. The
|
|
3156
|
+
following examples illustrate how to validate Parquet files:
|
|
3157
|
+
|
|
3158
|
+
```{python}
|
|
3159
|
+
# Single Parquet file from package data
|
|
3160
|
+
parquet_path = pb.get_data_path("nycflights", "parquet")
|
|
3161
|
+
|
|
3162
|
+
validation_5 = (
|
|
3163
|
+
pb.Validate(
|
|
3164
|
+
data=parquet_path,
|
|
3165
|
+
tbl_name="NYC Flights Data"
|
|
3166
|
+
)
|
|
3167
|
+
.col_vals_not_null(["carrier", "origin", "dest"])
|
|
3168
|
+
.col_vals_gt(columns="distance", value=0)
|
|
3169
|
+
.interrogate()
|
|
3170
|
+
)
|
|
3171
|
+
|
|
3172
|
+
validation_5
|
|
3173
|
+
```
|
|
3174
|
+
|
|
3175
|
+
You can also use glob patterns and directories. Here are some examples for how to:
|
|
3176
|
+
|
|
3177
|
+
1. load multiple Parquet files
|
|
3178
|
+
2. load a Parquet-containing directory
|
|
3179
|
+
3. load a partitioned Parquet dataset
|
|
3180
|
+
|
|
3181
|
+
```python
|
|
3182
|
+
# Multiple Parquet files with glob patterns
|
|
3183
|
+
validation_6 = pb.Validate(data="data/sales_*.parquet")
|
|
3184
|
+
|
|
3185
|
+
# Directory containing Parquet files
|
|
3186
|
+
validation_7 = pb.Validate(data="parquet_data/")
|
|
3187
|
+
|
|
3188
|
+
# Partitioned Parquet dataset
|
|
3189
|
+
validation_8 = (
|
|
3190
|
+
pb.Validate(data="sales_data/") # Contains year=2023/quarter=Q1/region=US/sales.parquet
|
|
3191
|
+
.col_exists(["transaction_id", "amount", "year", "quarter", "region"])
|
|
3192
|
+
.interrogate()
|
|
3193
|
+
)
|
|
3194
|
+
```
|
|
3195
|
+
|
|
3196
|
+
When you point to a directory that contains a partitioned Parquet dataset (with subdirectories
|
|
3197
|
+
like `year=2023/quarter=Q1/region=US/`), Pointblank will automatically:
|
|
3198
|
+
|
|
3199
|
+
- discover all Parquet files recursively
|
|
3200
|
+
- extract partition column values from directory paths
|
|
3201
|
+
- add partition columns to the final DataFrame
|
|
3202
|
+
- combine all partitions into a single table for validation
|
|
3203
|
+
|
|
3204
|
+
Both Polars and Pandas handle partitioned datasets natively, so this works seamlessly with
|
|
3205
|
+
either DataFrame library. The loading preference is Polars first, then Pandas as a fallback.
|
|
3206
|
+
|
|
3207
|
+
### Working with Database Connection Strings
|
|
3208
|
+
|
|
3209
|
+
The `Validate` class supports database connection strings for direct validation of database
|
|
3210
|
+
tables. Connection strings must specify a table using the `::table_name` suffix:
|
|
3211
|
+
|
|
3212
|
+
```{python}
|
|
3213
|
+
# Get path to a DuckDB database file from package data
|
|
3214
|
+
duckdb_path = pb.get_data_path("game_revenue", "duckdb")
|
|
3215
|
+
|
|
3216
|
+
validation_9 = (
|
|
3217
|
+
pb.Validate(
|
|
3218
|
+
data=f"duckdb:///{duckdb_path}::game_revenue",
|
|
3219
|
+
label="DuckDB Game Revenue Validation"
|
|
3220
|
+
)
|
|
3221
|
+
.col_exists(["player_id", "session_id", "item_revenue"])
|
|
3222
|
+
.col_vals_gt(columns="item_revenue", value=0)
|
|
3223
|
+
.interrogate()
|
|
3224
|
+
)
|
|
3225
|
+
|
|
3226
|
+
validation_9
|
|
3227
|
+
```
|
|
3228
|
+
|
|
3229
|
+
For comprehensive documentation on supported connection string formats, error handling, and
|
|
3230
|
+
installation requirements, see the [`connect_to_table()`](`pointblank.connect_to_table`)
|
|
3231
|
+
function. This function handles all the connection logic and provides helpful error messages
|
|
3232
|
+
when table specifications are missing or backend dependencies are not installed.
|
|
2321
3233
|
"""
|
|
2322
3234
|
|
|
2323
3235
|
data: FrameT | Any
|
|
@@ -2331,6 +3243,15 @@ class Validate:
|
|
|
2331
3243
|
locale: str | None = None
|
|
2332
3244
|
|
|
2333
3245
|
def __post_init__(self):
|
|
3246
|
+
# Handle connection string input for the data parameter
|
|
3247
|
+
self.data = _process_connection_string(self.data)
|
|
3248
|
+
|
|
3249
|
+
# Handle CSV file input for the data parameter
|
|
3250
|
+
self.data = _process_csv_input(self.data)
|
|
3251
|
+
|
|
3252
|
+
# Handle Parquet file input for the data parameter
|
|
3253
|
+
self.data = _process_parquet_input(self.data)
|
|
3254
|
+
|
|
2334
3255
|
# Check input of the `thresholds=` argument
|
|
2335
3256
|
_check_thresholds(thresholds=self.thresholds)
|
|
2336
3257
|
|
|
@@ -2506,12 +3427,16 @@ class Validate:
|
|
|
2506
3427
|
(i.e., no validation steps will be created for them).
|
|
2507
3428
|
|
|
2508
3429
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
2509
|
-
for more complex segmentation scenarios. The following inputs are
|
|
3430
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
2510
3431
|
|
|
2511
|
-
|
|
2512
|
-
|
|
2513
|
-
|
|
2514
|
-
|
|
3432
|
+
```
|
|
3433
|
+
# Segments from all unique values in the `region` column
|
|
3434
|
+
# and specific dates in the `date` column
|
|
3435
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
3436
|
+
|
|
3437
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
3438
|
+
segments=["region", "date"]
|
|
3439
|
+
```
|
|
2515
3440
|
|
|
2516
3441
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
2517
3442
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -2794,12 +3719,16 @@ class Validate:
|
|
|
2794
3719
|
(i.e., no validation steps will be created for them).
|
|
2795
3720
|
|
|
2796
3721
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
2797
|
-
for more complex segmentation scenarios. The following inputs are
|
|
3722
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
2798
3723
|
|
|
2799
|
-
|
|
2800
|
-
|
|
2801
|
-
|
|
2802
|
-
|
|
3724
|
+
```
|
|
3725
|
+
# Segments from all unique values in the `region` column
|
|
3726
|
+
# and specific dates in the `date` column
|
|
3727
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
3728
|
+
|
|
3729
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
3730
|
+
segments=["region", "date"]
|
|
3731
|
+
```
|
|
2803
3732
|
|
|
2804
3733
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
2805
3734
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -3081,12 +4010,16 @@ class Validate:
|
|
|
3081
4010
|
(i.e., no validation steps will be created for them).
|
|
3082
4011
|
|
|
3083
4012
|
A list with a combination of column names and tuples can be provided as well. This allows
|
|
3084
|
-
for more complex segmentation scenarios. The following inputs are
|
|
4013
|
+
for more complex segmentation scenarios. The following inputs are both valid:
|
|
3085
4014
|
|
|
3086
|
-
|
|
3087
|
-
|
|
3088
|
-
|
|
3089
|
-
|
|
4015
|
+
```
|
|
4016
|
+
# Segments from all unique values in the `region` column
|
|
4017
|
+
# and specific dates in the `date` column
|
|
4018
|
+
segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
|
|
4019
|
+
|
|
4020
|
+
# Segments from all unique values in the `region` and `date` columns
|
|
4021
|
+
segments=["region", "date"]
|
|
4022
|
+
```
|
|
3090
4023
|
|
|
3091
4024
|
The segmentation is performed during interrogation, and the resulting validation steps will
|
|
3092
4025
|
be numbered sequentially. Each segment will have its own validation step, and the results
|
|
@@ -3367,12 +4300,16 @@ class Validate:
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3651,12 +4588,16 @@ class Validate:
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3939,12 +4880,16 @@ class Validate:
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -4241,12 +5186,16 @@ class Validate:
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -4557,12 +5506,16 @@ class Validate:
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -4829,12 +5782,16 @@ class Validate:
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -5082,12 +6039,16 @@ class Validate:
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -5243,9 +6204,9 @@ class Validate:
 active: bool = True,
 ) -> Validate:
 """
-Validate whether values in a column are
+Validate whether values in a column are Null.
 
-The `col_vals_null()` validation method checks whether column values in a table are
+The `col_vals_null()` validation method checks whether column values in a table are Null.
 This validation will operate over the number of test units that is equal to the number
 of rows in the table.
 
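For context, a minimal sketch of how the completed `col_vals_null()` description translates into a call (the `small_table` dataset and its column `c` are assumed here; `c` is not entirely Null, so this step would simply report failing rows):

```python
import pointblank as pb

# Sketch: assert that every value in column `c` is Null; each row is one test unit
validation = (
    pb.Validate(data=pb.load_dataset("small_table", tbl_type="polars"))
    .col_vals_null(columns="c")
    .interrogate()
)
```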
@@ -5326,12 +6287,16 @@ class Validate:
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -5482,10 +6447,10 @@ class Validate:
 active: bool = True,
 ) -> Validate:
 """
-Validate whether values in a column are not
+Validate whether values in a column are not Null.
 
 The `col_vals_not_null()` validation method checks whether column values in a table are not
-
+Null. This validation will operate over the number of test units that is equal to the number
 of rows in the table.
 
 Parameters
@@ -5565,12 +6530,16 @@ class Validate:
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -5812,12 +6781,16 @@ class Validate:
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -6055,12 +7028,16 @@ class Validate:
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -6446,12 +7423,16 @@ class Validate:
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -6683,12 +7664,16 @@ class Validate:
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -8241,37 +9226,47 @@ class Validate:
 
 # Determine whether any preprocessing functions are to be applied to the table
 if validation.pre is not None:
-
-
+try:
+# Read the text of the preprocessing function
+pre_text = _pre_processing_funcs_to_str(validation.pre)
+
+# Determine if the preprocessing function is a lambda function; return a boolean
+is_lambda = re.match(r"^lambda", pre_text) is not None
 
-
-
+# If the preprocessing function is a lambda function, then check if there is
+# a keyword argument called `dfn` in the lamda signature; if so, that's a cue
+# to use a Narwhalified version of the table
+if is_lambda:
+# Get the signature of the lambda function
+sig = inspect.signature(validation.pre)
 
-
-
-
-
-# Get the signature of the lambda function
-sig = inspect.signature(validation.pre)
+# Check if the lambda function has a keyword argument called `dfn`
+if "dfn" in sig.parameters:
+# Convert the table to a Narwhals DataFrame
+data_tbl_step = nw.from_native(data_tbl_step)
 
-
-
-# Convert the table to a Narwhals DataFrame
-data_tbl_step = nw.from_native(data_tbl_step)
+# Apply the preprocessing function to the table
+data_tbl_step = validation.pre(dfn=data_tbl_step)
 
-
-
+# Convert the table back to its original format
+data_tbl_step = nw.to_native(data_tbl_step)
 
-
-
+else:
+# Apply the preprocessing function to the table
+data_tbl_step = validation.pre(data_tbl_step)
 
-
-
+# If the preprocessing function is a function, apply it to the table
+elif isinstance(validation.pre, Callable):
 data_tbl_step = validation.pre(data_tbl_step)
 
-
-
-
+except Exception:
+# If preprocessing fails, mark the validation as having an eval_error
+validation.eval_error = True
+end_time = datetime.datetime.now(datetime.timezone.utc)
+validation.proc_duration_s = (end_time - start_time).total_seconds()
+validation.time_processed = end_time.isoformat(timespec="milliseconds")
+validation.active = False
+continue
 
 # ------------------------------------------------
 # Segmentation stage
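The rewritten block above wraps preprocessing in a `try`/`except` and, for lambdas that declare a `dfn` parameter, hands the step a Narwhals-wrapped table. A hedged usage sketch of the two `pre=` styles (column `d` from `small_table` is assumed):

```python
import narwhals as nw
import polars as pl
import pointblank as pb

def halve_d(tbl: pl.DataFrame) -> pl.DataFrame:
    # A plain callable receives the table in its native form
    return tbl.with_columns(pl.col("d") / 2)

validation = (
    pb.Validate(data=pb.load_dataset("small_table", tbl_type="polars"))
    # `pre=` with a regular function: applied directly to the native table
    .col_vals_gt(columns="d", value=0, pre=halve_d)
    # `pre=` with a lambda whose parameter is named `dfn`: per the hunk above,
    # the table is Narwhals-wrapped before the lambda runs and unwrapped after
    .col_vals_gt(columns="d", value=0, pre=lambda dfn: dfn.with_columns(nw.col("d") * 2))
    .interrogate()
)
```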
@@ -8284,12 +9279,28 @@ class Validate:
 data_tbl=data_tbl_step, segments_expr=validation.segments
 )
 
+# ------------------------------------------------
+# Determine table type and `collect()` if needed
+# ------------------------------------------------
+
+if tbl_type not in IBIS_BACKENDS:
+tbl_type = "local"
+
+# If the table is a lazy frame, we need to collect it
+if _is_lazy_frame(data_tbl_step):
+data_tbl_step = data_tbl_step.collect()
+
+# ------------------------------------------------
+# Set the number of test units
+# ------------------------------------------------
+
 validation.n = NumberOfTestUnits(df=data_tbl_step, column=column).get_test_units(
 tbl_type=tbl_type
 )
 
-
-
+# ------------------------------------------------
+# Validation stage
+# ------------------------------------------------
 
 if assertion_category == "COMPARE_ONE":
 results_tbl = ColValsCompareOne(
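Because the new block collects lazy frames before test units are counted, a Polars `LazyFrame` can be handed straight to `Validate`. A small sketch (the column name `x` is made up):

```python
import polars as pl
import pointblank as pb

lazy_tbl = pl.LazyFrame({"x": [1, 2, 3, None]})

# Per the hunk above, the step table is collected during interrogation before
# the number of test units (rows) is determined
validation = pb.Validate(data=lazy_tbl).col_vals_not_null(columns="x").interrogate()
```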
@@ -8480,36 +9491,32 @@ class Validate:
 
 else:
 # If the result is not a list, then we assume it's a table in the conventional
-# form (where the column is `pb_is_good_` exists, with boolean values
-
+# form (where the column is `pb_is_good_` exists, with boolean values
 results_tbl = results_tbl_list
 
 # If the results table is not `None`, then we assume there is a table with a column
 # called `pb_is_good_` that contains boolean values; we can then use this table to
 # determine the number of test units that passed and failed
 if results_tbl is not None:
-#
-
-
-
-
-
-
-results_list = (
-results_tbl.select("pb_is_good_").to_pandas()["pb_is_good_"].to_list()
-)
-else:
-results_list = (
-results_tbl.select("pb_is_good_").to_polars()["pb_is_good_"].to_list()
-)
+# Count the number of passing and failing test units
+validation.n_passed = _count_true_values_in_column(
+tbl=results_tbl, column="pb_is_good_"
+)
+validation.n_failed = _count_true_values_in_column(
+tbl=results_tbl, column="pb_is_good_", inverse=True
+)
 
-
-
+# Solely for the col_vals_in_set assertion type, any Null values in the
+# `pb_is_good_` column are counted as failing test units
+if assertion_type == "col_vals_in_set":
+null_count = _count_null_values_in_column(tbl=results_tbl, column="pb_is_good_")
+validation.n_failed += null_count
 
-
-validation.n =
-
-
+# For column-value validations, the number of test units is the number of rows
+validation.n = get_row_count(data=results_tbl)
+
+# Set the `all_passed` attribute based on whether there are any failing test units
+validation.all_passed = validation.n_failed == 0
 
 # Calculate fractions of passing and failing test units
 # - `f_passed` is the fraction of test units that passed
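The refactor above replaces backend-specific `.to_pandas()`/`.to_polars()` list materialization with the `_count_true_values_in_column()` and `_count_null_values_in_column()` helpers. Their implementations are not part of this diff; a rough, Polars-only stand-in for the True-counting behavior might look like this (an assumption, not the package's actual code):

```python
import polars as pl

def count_true_values(tbl: pl.DataFrame, column: str, inverse: bool = False) -> int:
    # Hypothetical stand-in for pointblank's `_count_true_values_in_column()`:
    # sum a boolean column (or its negation) to count the True/False test units
    expr = pl.col(column).not_() if inverse else pl.col(column)
    return int(tbl.select(expr.sum()).item())

results_tbl = pl.DataFrame({"pb_is_good_": [True, True, False, None]})
print(count_true_values(results_tbl, "pb_is_good_"))                 # 2
print(count_true_values(results_tbl, "pb_is_good_", inverse=True))   # 1
```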
@@ -9818,7 +10825,7 @@ class Validate:
 Get the 'critical' level status for each validation step.
 
 The 'critical' status for a validation step is `True` if the fraction of failing test units
-meets or exceeds the threshold for the
+meets or exceeds the threshold for the 'critical' level. Otherwise, the status is `False`.
 
 The ascribed name of 'critical' is semantic and is thus simply a status indicator that could
 be used to trigger some action to be take. Here's how it fits in with other status
@@ -9830,14 +10837,14 @@ class Validate:
 severity
 - 'critical': the status obtained by calling `critical()`, most severe
 
-This method provides a dictionary of the
-
-
+This method provides a dictionary of the 'critical' status for each validation step. If the
+`scalar=True` argument is provided and `i=` is a scalar, the value is returned as a scalar
+instead of a dictionary.
 
 Parameters
 ----------
 i
-The validation step number(s) from which the
+The validation step number(s) from which the 'critical' status is obtained. Can be
 provided as a list of integers or a single integer. If `None`, all steps are included.
 scalar
 If `True` and `i=` is a scalar, return the value as a scalar instead of a dictionary.
@@ -9845,7 +10852,7 @@ class Validate:
 Returns
 -------
 dict[int, bool] | bool
-A dictionary of the
+A dictionary of the 'critical' status for each validation step or a scalar value.
 
 Examples
 --------
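The completed docstring text above describes the return shape of `critical()`. A hedged sketch of reading that status after interrogation (the threshold values and column name are illustrative):

```python
import pointblank as pb

validation = (
    pb.Validate(
        data=pb.load_dataset("small_table", tbl_type="polars"),
        thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
    )
    .col_vals_not_null(columns="c")
    .interrogate()
)

statuses = validation.critical()                    # e.g. {1: False}, keyed by step number
first_step = validation.critical(i=1, scalar=True)  # a single bool for step 1
```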
@@ -9924,11 +10931,13 @@ class Validate:
 Get the rows that failed for each validation step.
 
 After the [`interrogate()`](`pointblank.Validate.interrogate`) method has been called, the
-`get_data_extracts()` method can be used to extract the rows that failed in each
-validation step (e.g.,
-
-
-
+`get_data_extracts()` method can be used to extract the rows that failed in each
+column-value or row-based validation step (e.g.,
+[`col_vals_gt()`](`pointblank.Validate.col_vals_gt`),
+[`rows_distinct()`](`pointblank.Validate.rows_distinct`), etc.). The method returns a
+dictionary of tables containing the rows that failed in every validation step. If
+`frame=True` and `i=` is a scalar, the value is conveniently returned as a table (forgoing
+the dictionary structure).
 
 Parameters
 ----------
@@ -9941,13 +10950,13 @@ class Validate:
 Returns
 -------
 dict[int, FrameT | None] | FrameT | None
-A dictionary of tables containing the rows that failed in every
-step
+A dictionary of tables containing the rows that failed in every compatible validation
+step. Alternatively, it can be a DataFrame if `frame=True` and `i=` is a scalar.
 
-Validation Methods
-
-The following validation methods
-failing test units.
+Compatible Validation Methods for Yielding Extracted Rows
+---------------------------------------------------------
+The following validation methods operate on column values and will have rows extracted when
+there are failing test units.
 
 - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
 - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
@@ -9962,11 +10971,20 @@ class Validate:
 - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
 - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
 - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
+- [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
+- [`conjointly()`](`pointblank.Validate.conjointly`)
+
+An extracted row for these validation methods means that a test unit failed for that row in
+the validation step.
+
+These row-based validation methods will also have rows extracted should there be failing
+rows:
+
 - [`rows_distinct()`](`pointblank.Validate.rows_distinct`)
+- [`rows_complete()`](`pointblank.Validate.rows_complete`)
 
-
-
-understanding the nature of the failing test units.
+The extracted rows are a subset of the original table and are useful for further analysis
+or for understanding the nature of the failing test units.
 
 Examples
 --------
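A short, hedged usage sketch for `get_data_extracts()` that matches the expanded docstring above (dataset and step setup are illustrative):

```python
import pointblank as pb

validation = (
    pb.Validate(data=pb.load_dataset("small_table", tbl_type="polars"))
    .col_vals_gt(columns="d", value=1000)
    .interrogate()
)

extracts = validation.get_data_extracts()                     # dict keyed by step number
failed_rows = validation.get_data_extracts(i=1, frame=True)   # just the table for step 1
```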
@@ -10222,10 +11240,10 @@ class Validate:
 Get the data that passed or failed the validation steps.
 
 Validation of the data is one thing but, sometimes, you want to use the best part of the
-input dataset for something else. The `get_sundered_data()` method works with a Validate
+input dataset for something else. The `get_sundered_data()` method works with a `Validate`
 object that has been interrogated (i.e., the
 [`interrogate()`](`pointblank.Validate.interrogate`) method was used). We can get either the
-'pass' data piece (rows with no failing test units across all
+'pass' data piece (rows with no failing test units across all column-value based validation
 functions), or, the 'fail' data piece (rows with at least one failing test unit across the
 same series of validations).
 
@@ -10234,7 +11252,7 @@ class Validate:
 There are some caveats to sundering. The validation steps considered for this splitting will
 only involve steps where:
 
-- of certain check types, where test units are cells checked
+- of certain check types, where test units are cells checked down a column (e.g., the
 `col_vals_*()` methods)
 - `active=` is not set to `False`
 - `pre=` has not been given an expression for modifying the input table
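And a matching sketch for `get_sundered_data()` as described above; the `type=` argument selecting the 'pass' or 'fail' piece is assumed from the docstring wording:

```python
import pointblank as pb

validation = (
    pb.Validate(data=pb.load_dataset("small_table", tbl_type="polars"))
    .col_vals_not_null(columns="c")
    .interrogate()
)

passing_rows = validation.get_sundered_data(type="pass")  # rows with no failing test units
failing_rows = validation.get_sundered_data(type="fail")  # rows with at least one failure
```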
@@ -10465,6 +11483,19 @@ class Validate:
 # Get information on the input data table
 tbl_info = _get_tbl_type(data=self.data)
 
+# If the table is a Polars one, determine if it's a LazyFrame
+if tbl_info == "polars":
+if _is_lazy_frame(self.data):
+tbl_info = "polars-lazy"
+
+# Determine if the input table is a Narwhals DF
+if _is_narwhals_table(self.data):
+# Determine if the Narwhals table is a LazyFrame
+if _is_lazy_frame(self.data):
+tbl_info = "narwhals-lazy"
+else:
+tbl_info = "narwhals"
+
 # Get the thresholds object
 thresholds = self.thresholds
 
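The new branch above labels the input as "polars-lazy", "narwhals", or "narwhals-lazy" for the report header. A rough re-creation of that decision for illustration (the `_is_lazy_frame()` and `_is_narwhals_table()` helpers live in `pointblank._utils`; their bodies are assumed here):

```python
import narwhals as nw
import polars as pl

def label_table(data) -> str:
    # Hypothetical sketch of the table-type labelling used in the hunk above
    if isinstance(data, (nw.DataFrame, nw.LazyFrame)):
        return "narwhals-lazy" if isinstance(data, nw.LazyFrame) else "narwhals"
    if isinstance(data, pl.LazyFrame):
        return "polars-lazy"
    if isinstance(data, pl.DataFrame):
        return "polars"
    return "unknown"
```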
@@ -10517,7 +11548,9 @@ class Validate:
 # Create the label, table type, and thresholds HTML fragments
 label_html = _create_label_html(label=self.label, start_time="")
 table_type_html = _create_table_type_html(tbl_type=tbl_info, tbl_name=self.tbl_name)
-thresholds_html = _create_thresholds_html(
+thresholds_html = _create_thresholds_html(
+thresholds=thresholds, locale=locale, df_lib=df_lib
+)
 
 # Compose the subtitle HTML fragment
 combined_subtitle = (
@@ -10830,6 +11863,7 @@ class Validate:
 interrogation_performed=interrogation_performed,
 active=active,
 locale=locale,
+df_lib=df_lib,
 )
 
 # ------------------------------------------------
@@ -10846,6 +11880,7 @@ class Validate:
 interrogation_performed=interrogation_performed,
 active=active,
 locale=locale,
+df_lib=df_lib,
 )
 
 validation_info_dict["fail"] = _transform_passed_failed(
@@ -10854,6 +11889,7 @@ class Validate:
 interrogation_performed=interrogation_performed,
 active=active,
 locale=locale,
+df_lib=df_lib,
 )
 
 # ------------------------------------------------
@@ -11033,7 +12069,9 @@ class Validate:
 # Create the label, table type, and thresholds HTML fragments
 label_html = _create_label_html(label=self.label, start_time=self.time_start)
 table_type_html = _create_table_type_html(tbl_type=tbl_info, tbl_name=self.tbl_name)
-thresholds_html = _create_thresholds_html(
+thresholds_html = _create_thresholds_html(
+thresholds=thresholds, locale=locale, df_lib=df_lib
+)
 
 # Compose the subtitle HTML fragment
 combined_subtitle = (
@@ -11291,24 +12329,25 @@ class Validate:
 Types of Step Reports
 ---------------------
 The `get_step_report()` method produces a report based on the *type* of validation step.
-The following row-based validation methods will produce a
-
+The following column-value or row-based validation step validation methods will produce a
+report that shows the rows of the data that failed:
 
 - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
+- [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
 - [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`)
+- [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
 - [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`)
 - [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`)
-- [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
-- [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
 - [`col_vals_between()`](`pointblank.Validate.col_vals_between`)
 - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
 - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
 - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
-- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
 - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
 - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
-- [`
+- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
+- [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
 - [`conjointly()`](`pointblank.Validate.conjointly`)
+- [`rows_complete()`](`pointblank.Validate.rows_complete`)
 
 The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
 report that shows duplicate rows (or duplicate values in one or a set of columns as defined
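A brief, hedged sketch of requesting one of these step reports (the step index and column are illustrative):

```python
import pointblank as pb

validation = (
    pb.Validate(data=pb.load_dataset("small_table", tbl_type="polars"))
    .col_vals_gt(columns="d", value=1000)
    .interrogate()
)

# Per the list above, a `col_vals_gt()` step yields a report of the failing rows
step_report = validation.get_step_report(i=1)
```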
@@ -12835,20 +13874,78 @@ def _transform_eval(
 return symbol_list
 
 
+def _format_numbers_with_gt(
+values: list[int], n_sigfig: int = 3, compact: bool = True, locale: str = "en"
+) -> list[str]:
+"""Format numbers using Great Tables GT object to avoid pandas dependency."""
+import polars as pl
+
+# Create a single-column DataFrame with all values
+df = pl.DataFrame({"values": values})
+
+# Create GT object and format the column
+gt_obj = GT(df).fmt_number(columns="values", n_sigfig=n_sigfig, compact=compact, locale=locale)
+
+# Extract the formatted values using _get_column_of_values
+formatted_values = _get_column_of_values(gt_obj, column_name="values", context="html")
+
+return formatted_values
+
+
+def _format_single_number_with_gt(
+value: int, n_sigfig: int = 3, compact: bool = True, locale: str = "en", df_lib=None
+) -> str:
+"""Format a single number using Great Tables GT object to avoid pandas dependency."""
+if df_lib is None:
+# Use library detection to select appropriate DataFrame library
+if _is_lib_present("polars"):
+import polars as pl
+
+df_lib = pl
+elif _is_lib_present("pandas"):
+import pandas as pd
+
+df_lib = pd
+else:
+raise ImportError("Neither Polars nor Pandas is available for formatting")
+
+# Create a single-row, single-column DataFrame using the specified library
+df = df_lib.DataFrame({"value": [value]})
+
+# Create GT object and format the column
+gt_obj = GT(df).fmt_number(columns="value", n_sigfig=n_sigfig, compact=compact, locale=locale)
+
+# Extract the formatted value using _get_column_of_values
+formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
+
+return formatted_values[0] # Return the single formatted value
+
+
 def _transform_test_units(
-test_units: list[int],
+test_units: list[int],
+interrogation_performed: bool,
+active: list[bool],
+locale: str,
+df_lib=None,
 ) -> list[str]:
 # If no interrogation was performed, return a list of empty strings
 if not interrogation_performed:
 return ["" for _ in range(len(test_units))]
 
+# Define the helper function that'll format numbers safely with Great Tables
+def _format_number_safe(value: int) -> str:
+if df_lib is not None:
+# Use GT-based formatting to avoid Pandas dependency completely
+return _format_single_number_with_gt(
+value, n_sigfig=3, compact=True, locale=locale, df_lib=df_lib
+)
+else:
+# Fallback to the original behavior
+return str(vals.fmt_number(value, n_sigfig=3, compact=True, locale=locale)[0])
+
 return [
 (
-(
-str(test_units[i])
-if test_units[i] < 10000
-else str(vals.fmt_number(test_units[i], n_sigfig=3, compact=True, locale=locale)[0])
-)
+(str(test_units[i]) if test_units[i] < 10000 else _format_number_safe(test_units[i]))
 if active[i]
 else "—"
 )
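The new `_format_*_with_gt()` helpers route number formatting through a Great Tables `GT` object so that whichever DataFrame library is installed (Polars here) can be used instead of requiring Pandas. A small demonstration of the same trick; the printed strings are only indicative:

```python
import polars as pl
from great_tables import GT
from great_tables.gt import _get_column_of_values

df = pl.DataFrame({"values": [1234, 5678901]})
gt_obj = GT(df).fmt_number(columns="values", n_sigfig=3, compact=True, locale="en")

# Pull the formatted strings back out, as the helpers above do
print(_get_column_of_values(gt_obj, column_name="values", context="html"))
# e.g. ['1.23K', '5.68M']
```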
@@ -12856,8 +13953,43 @@ def _transform_test_units(
 ]
 
 
-def _fmt_lg(value: int, locale: str) -> str:
-
+def _fmt_lg(value: int, locale: str, df_lib=None) -> str:
+if df_lib is not None:
+# Use GT-based formatting if a DataFrame library is provided
+return _format_single_number_with_gt(
+value, n_sigfig=3, compact=True, locale=locale, df_lib=df_lib
+)
+else:
+# Fallback to the original behavior
+return vals.fmt_number(value, n_sigfig=3, compact=True, locale=locale)[0]
+
+
+def _format_single_float_with_gt(
+value: float, decimals: int = 2, locale: str = "en", df_lib=None
+) -> str:
+if df_lib is None:
+# Use library detection to select appropriate DataFrame library
+if _is_lib_present("polars"):
+import polars as pl
+
+df_lib = pl
+elif _is_lib_present("pandas"):
+import pandas as pd
+
+df_lib = pd
+else:
+raise ImportError("Neither Polars nor Pandas is available for formatting")
+
+# Create a single-row, single-column DataFrame using the specified library
+df = df_lib.DataFrame({"value": [value]})
+
+# Create GT object and format the column
+gt_obj = GT(df).fmt_number(columns="value", decimals=decimals, locale=locale)
+
+# Extract the formatted value using _get_column_of_values
+formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
+
+return formatted_values[0] # Return the single formatted value
 
 
 def _transform_passed_failed(
@@ -12866,14 +13998,24 @@ def _transform_passed_failed(
 interrogation_performed: bool,
 active: list[bool],
 locale: str,
+df_lib=None,
 ) -> list[str]:
 if not interrogation_performed:
 return ["" for _ in range(len(n_passed_failed))]
 
+# Helper function to format numbers safely
+def _format_float_safe(value: float) -> str:
+if df_lib is not None:
+# Use GT-based formatting to avoid Pandas dependency completely
+return _format_single_float_with_gt(value, decimals=2, locale=locale, df_lib=df_lib)
+else:
+# Fallback to the original behavior
+return vals.fmt_number(value, decimals=2, locale=locale)[0]
+
 passed_failed = [
 (
-f"{n_passed_failed[i] if n_passed_failed[i] < 10000 else _fmt_lg(n_passed_failed[i], locale=locale)}"
-f"<br />{
+f"{n_passed_failed[i] if n_passed_failed[i] < 10000 else _fmt_lg(n_passed_failed[i], locale=locale, df_lib=df_lib)}"
+f"<br />{_format_float_safe(f_passed_failed[i])}"
 if active[i]
 else "—"
 )
@@ -13084,41 +14226,122 @@ def _create_label_html(label: str | None, start_time: str) -> str:
 )
 
 
-def
+def _format_single_integer_with_gt(value: int, locale: str = "en", df_lib=None) -> str:
+"""Format a single integer using Great Tables GT object to avoid pandas dependency."""
+if df_lib is None:
+# Use library detection to select appropriate DataFrame library
+if _is_lib_present("polars"):
+import polars as pl
+
+df_lib = pl
+elif _is_lib_present("pandas"):
+import pandas as pd
+
+df_lib = pd
+else:
+raise ImportError("Neither Polars nor Pandas is available for formatting")
+
+# Create a single-row, single-column DataFrame using the specified library
+df = df_lib.DataFrame({"value": [value]})
+
+# Create GT object and format the column
+gt_obj = GT(df).fmt_integer(columns="value", locale=locale)
+
+# Extract the formatted value using _get_column_of_values
+formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
+
+return formatted_values[0] # Return the single formatted value
+
+
+def _format_single_float_with_gt_custom(
+value: float,
+decimals: int = 2,
+drop_trailing_zeros: bool = False,
+locale: str = "en",
+df_lib=None,
+) -> str:
+"""Format a single float with custom options using Great Tables GT object to avoid pandas dependency."""
+if df_lib is None:
+# Use library detection to select appropriate DataFrame library
+if _is_lib_present("polars"):
+import polars as pl
+
+df_lib = pl
+elif _is_lib_present("pandas"):
+import pandas as pd
+
+df_lib = pd
+else:
+raise ImportError("Neither Polars nor Pandas is available for formatting")
+
+# Create a single-row, single-column DataFrame using the specified library
+df = df_lib.DataFrame({"value": [value]})
+
+# Create GT object and format the column
+gt_obj = GT(df).fmt_number(
+columns="value", decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
+)
+
+# Extract the formatted value using _get_column_of_values
+formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html")
+
+return formatted_values[0] # Return the single formatted value
+
+
+def _create_thresholds_html(thresholds: Thresholds, locale: str, df_lib=None) -> str:
 if thresholds == Thresholds():
 return ""
 
+# Helper functions to format numbers safely
+def _format_number_safe(value: float, decimals: int, drop_trailing_zeros: bool = False) -> str:
+if df_lib is not None and value is not None:
+# Use GT-based formatting to avoid Pandas dependency completely
+return _format_single_float_with_gt_custom(
+value,
+decimals=decimals,
+drop_trailing_zeros=drop_trailing_zeros,
+locale=locale,
+df_lib=df_lib,
+)
+else:
+# Fallback to the original behavior
+return fmt_number(
+value, decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
+)[0]
+
+def _format_integer_safe(value: int) -> str:
+if df_lib is not None and value is not None:
+# Use GT-based formatting to avoid Pandas dependency completely
+return _format_single_integer_with_gt(value, locale=locale, df_lib=df_lib)
+else:
+# Fallback to the original behavior
+return fmt_integer(value, locale=locale)[0]
+
 warning = (
-
-thresholds.warning_fraction, decimals=3, drop_trailing_zeros=True, locale=locale
-)[0]
+_format_number_safe(thresholds.warning_fraction, decimals=3, drop_trailing_zeros=True)
 if thresholds.warning_fraction is not None
 else (
-
+_format_integer_safe(thresholds.warning_count)
 if thresholds.warning_count is not None
 else "—"
 )
 )
 
 error = (
-
-0
-]
+_format_number_safe(thresholds.error_fraction, decimals=3, drop_trailing_zeros=True)
 if thresholds.error_fraction is not None
 else (
-
+_format_integer_safe(thresholds.error_count)
 if thresholds.error_count is not None
 else "—"
 )
 )
 
 critical = (
-
-thresholds.critical_fraction, decimals=3, drop_trailing_zeros=True, locale=locale
-)[0]
+_format_number_safe(thresholds.critical_fraction, decimals=3, drop_trailing_zeros=True)
 if thresholds.critical_fraction is not None
 else (
-
+_format_integer_safe(thresholds.critical_count)
 if thresholds.critical_count is not None
 else "—"
 )