pointblank 0.12.1__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
pointblank/_interrogation.py CHANGED
@@ -1388,6 +1388,17 @@ class RowsDistinct:
1388
1388
  def get_test_results(self):
1389
1389
  return self.test_unit_res
1390
1390
 
1391
+ def test(self):
1392
+ # Get the number of failing test units by counting instances of `False` in the `pb_is_good_`
1393
+ # column and then determine if the test passes overall by comparing the number of failing
1394
+ # test units to the threshold for failing test units
1395
+
1396
+ results_list = nw.from_native(self.test_unit_res)["pb_is_good_"].to_list()
1397
+
1398
+ return _threshold_check(
1399
+ failing_test_units=results_list.count(False), threshold=self.threshold
1400
+ )
1401
+
1391
1402
 
1392
1403
  @dataclass
1393
1404
  class RowsComplete:
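
The hunk above adds a `test()` method to `RowsDistinct` that counts `False` entries in the `pb_is_good_` results column and compares that count against a failure threshold. Below is a minimal sketch of that pass/fail pattern; `threshold_check` is a stand-in for pointblank's internal `_threshold_check()` helper, and the "pass while failures stay below the threshold" semantics are an assumption, not a confirmed behavior of the real helper.

```python
# Hedged sketch only: `threshold_check` approximates what `_threshold_check()`
# is assumed to do; the real helper in pointblank may differ.
def threshold_check(failing_test_units: int, threshold: int) -> bool:
    # Pass while the count of failing test units stays below the threshold
    return failing_test_units < threshold

results_list = [True, False, True, False, True]  # stand-in for the `pb_is_good_` column
print(threshold_check(results_list.count(False), threshold=3))  # True: 2 failures < 3
```
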
@@ -2029,23 +2040,6 @@ def _column_has_null_values(table: FrameT, column: str) -> bool:
2029
2040
  return True
2030
2041
 
2031
2042
 
2032
- def _check_nulls_across_columns_ibis(table, columns_subset):
2033
- # Get all column names from the table
2034
- column_names = columns_subset if columns_subset else table.columns
2035
-
2036
- # Build the expression by combining each column's isnull() with OR operations
2037
- null_expr = functools.reduce(
2038
- lambda acc, col: acc | table[col].isnull() if acc is not None else table[col].isnull(),
2039
- column_names,
2040
- None,
2041
- )
2042
-
2043
- # Add the expression as a new column to the table
2044
- result = table.mutate(_any_is_null_=null_expr)
2045
-
2046
- return result
2047
-
2048
-
2049
2043
  def _check_nulls_across_columns_nw(table, columns_subset):
2050
2044
  # Get all column names from the table
2051
2045
  column_names = columns_subset if columns_subset else table.columns
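
The removed `_check_nulls_across_columns_ibis()` combined per-column `isnull()` checks with `functools.reduce()`, while the surviving narwhals helper (`_check_nulls_across_columns_nw()`) is cut off by the hunk above. The sketch below shows how such a narwhals-based version could plausibly be written; `any_null_column` and the column names are illustrative, not the package's actual implementation.

```python
import functools

import narwhals as nw
import polars as pl


def any_null_column(native_df, columns_subset=None):
    # Build one boolean expression that is True wherever any selected column is null
    df = nw.from_native(native_df)
    cols = columns_subset if columns_subset else df.columns
    null_expr = functools.reduce(
        lambda acc, col: acc | nw.col(col).is_null(),
        cols[1:],
        nw.col(cols[0]).is_null(),
    )
    # Attach the combined check as a new column and hand back the native frame
    return df.with_columns(null_expr.alias("_any_is_null_")).to_native()


print(any_null_column(pl.DataFrame({"a": [1, None], "b": [None, 2]})))
```
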
pointblank/data/api-docs.txt CHANGED
@@ -9798,7 +9798,7 @@ validation workflows. The `yaml_interrogate()` function can be used to run a val
9798
9798
  YAML strings or files. The `validate_yaml()` function checks if the YAML configuration
9799
9799
  passes its own validity checks.
9800
9800
 
9801
- yaml_interrogate(yaml: 'Union[str, Path]') -> 'Validate'
9801
+ yaml_interrogate(yaml: 'Union[str, Path]', set_tbl: 'Union[FrameT, Any, None]' = None) -> 'Validate'
9802
9802
  Execute a YAML-based validation workflow.
9803
9803
 
9804
9804
  This is the main entry point for YAML-based validation workflows. It takes YAML configuration
@@ -9813,13 +9813,20 @@ Execute a YAML-based validation workflow.
9813
9813
  yaml
9814
9814
  YAML configuration as string or file path. Can be: (1) a YAML string containing the
9815
9815
  validation configuration, or (2) a Path object or string path to a YAML file.
9816
+ set_tbl
9817
+ An optional table to override the table specified in the YAML configuration. This allows you
9818
+ to apply a YAML-defined validation workflow to a different table than what's specified in
9819
+ the configuration. If provided, this table will replace the table defined in the YAML's
9820
+ `tbl` field before executing the validation workflow. This can be any supported table type
9821
+ including DataFrame objects, Ibis table objects, CSV file paths, Parquet file paths, GitHub
9822
+ URLs, or database connection strings.
9816
9823
 
9817
9824
  Returns
9818
9825
  -------
9819
9826
  Validate
9820
- An instance of the `Validate` class that has been configured based on the YAML input.
9821
- This object contains the results of the validation steps defined in the YAML configuration.
9822
- It includes metadata like table name, label, language, and thresholds if specified.
9827
+ An instance of the `Validate` class that has been configured based on the YAML input. This
9828
+ object contains the results of the validation steps defined in the YAML configuration. It
9829
+ includes metadata like table name, label, language, and thresholds if specified.
9823
9830
 
9824
9831
  Raises
9825
9832
  ------
@@ -9918,6 +9925,44 @@ Execute a YAML-based validation workflow.
9918
9925
  This approach is particularly useful for storing validation configurations as part of your data
9919
9926
  pipeline or version control system, allowing you to maintain validation rules alongside your
9920
9927
  code.
9928
+
9929
+ ### Using `set_tbl=` to Override the Table
9930
+
9931
+ The `set_tbl=` parameter allows you to override the table specified in the YAML configuration.
9932
+ This is useful when you have a template validation workflow but want to apply it to different
9933
+ tables:
9934
+
9935
+ ```python
9936
+ import polars as pl
9937
+
9938
+ # Create a test table with similar structure to small_table
9939
+ test_table = pl.DataFrame({
9940
+ "date": ["2023-01-01", "2023-01-02", "2023-01-03"],
9941
+ "a": [1, 2, 3],
9942
+ "b": ["1-abc-123", "2-def-456", "3-ghi-789"],
9943
+ "d": [150, 200, 250]
9944
+ })
9945
+
9946
+ # Use the same YAML config but apply it to our test table
9947
+ yaml_config = '''
9948
+ tbl: small_table # This will be overridden
9949
+ tbl_name: Test Table # This name will be used
9950
+ steps:
9951
+ - col_exists:
9952
+ columns: [date, a, b, d]
9953
+ - col_vals_gt:
9954
+ columns: [d]
9955
+ value: 100
9956
+ '''
9957
+
9958
+ # Execute with table override
9959
+ result = pb.yaml_interrogate(yaml_config, set_tbl=test_table)
9960
+ print(f"Validation applied to: {result.tbl_name}")
9961
+ result
9962
+ ```
9963
+
9964
+ This feature makes YAML configurations more reusable and flexible, allowing you to define
9965
+ validation logic once and apply it to multiple similar tables.
9921
9966
 
9922
9967
 
9923
9968
  validate_yaml(yaml: 'Union[str, Path]') -> 'None'
pointblank/validate.py CHANGED
@@ -740,9 +740,9 @@ def _process_data(data: FrameT | Any) -> FrameT | Any:
740
740
  """
741
741
  Centralized data processing pipeline that handles all supported input types.
742
742
 
743
- This function consolidates the data processing pipeline used across multiple
744
- classes and functions in Pointblank. It processes data through a consistent
745
- sequence of transformations to handle different data source types.
743
+ This function consolidates the data processing pipeline used across multiple classes and
744
+ functions in Pointblank. It processes data through a consistent sequence of transformations to
745
+ handle different data source types.
746
746
 
747
747
  The processing order is important:
748
748
 
@@ -829,7 +829,9 @@ def _process_github_url(data: FrameT | Any) -> FrameT | Any:
829
829
  # Parse the URL to check if it's a GitHub URL
830
830
  try:
831
831
  parsed = urlparse(data)
832
- except Exception:
832
+ except ValueError:
833
+ # urlparse can raise ValueError for malformed URLs (e.g., invalid IPv6)
834
+ # Return original data as it's likely not a GitHub URL we can process
833
835
  return data
834
836
 
835
837
  # Check if it's a GitHub URL (standard or raw)
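
As the new comment notes, `urlparse()` raises `ValueError` on certain malformed inputs, so narrowing the `except` clause makes the handled failure explicit. A quick illustration of that failure mode:

```python
from urllib.parse import urlparse

try:
    urlparse("https://[not-a-valid-ipv6")  # unbalanced bracket in the host part
except ValueError as exc:
    print(exc)  # "Invalid IPv6 URL"
```
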
@@ -881,13 +883,10 @@ def _process_github_url(data: FrameT | Any) -> FrameT | Any:
881
883
  else: # .parquet
882
884
  return _process_parquet_input(tmp_file_path)
883
885
 
884
- except Exception:
886
+ except Exception: # pragma: no cover
885
887
  # If download or processing fails, return original data
886
888
  return data
887
889
 
888
- except Exception as e:
889
- raise RuntimeError(f"Failed to download or process GitHub file from {raw_url}: {e}") from e
890
-
891
890
 
892
891
  def _process_connection_string(data: FrameT | Any) -> FrameT | Any:
893
892
  """
@@ -943,8 +942,7 @@ def _process_csv_input(data: FrameT | Any) -> FrameT | Any:
943
942
  if not csv_path.exists():
944
943
  raise FileNotFoundError(f"CSV file not found: {csv_path}")
945
944
 
946
- # Determine which library to use for reading CSV
947
- # Prefer Polars, fallback to Pandas
945
+ # Determine which library to use for reading CSV: prefer Polars but fallback to Pandas
948
946
  if _is_lib_present(lib_name="polars"):
949
947
  try:
950
948
  import polars as pl
@@ -956,7 +954,7 @@ def _process_csv_input(data: FrameT | Any) -> FrameT | Any:
956
954
  import pandas as pd
957
955
 
958
956
  return pd.read_csv(csv_path)
959
- else:
957
+ else: # pragma: no cover
960
958
  raise RuntimeError(
961
959
  f"Failed to read CSV file with Polars: {e}. "
962
960
  "Pandas is not available as fallback."
@@ -1093,7 +1091,7 @@ def _process_parquet_input(data: FrameT | Any) -> FrameT | Any:
1093
1091
  # Multiple files: concatenate them
1094
1092
  dfs = [pd.read_parquet(path) for path in parquet_paths]
1095
1093
  return pd.concat(dfs, ignore_index=True)
1096
- else:
1094
+ else: # pragma: no cover
1097
1095
  raise RuntimeError(
1098
1096
  f"Failed to read Parquet file(s) with Polars: {e}. "
1099
1097
  "Pandas is not available as fallback."
@@ -1615,24 +1613,9 @@ def _generate_display_table(
1615
1613
  # This is used to highlight these values in the table
1616
1614
  if df_lib_name_gt == "polars":
1617
1615
  none_values = {k: data[k].is_null().to_list() for k in col_names}
1618
- elif df_lib_name_gt == "pyspark":
1619
- # For PySpark, check if data has been converted to pandas already
1620
- if hasattr(data, "isnull"):
1621
- # Data has been converted to pandas
1622
- none_values = {k: data[k].isnull() for k in col_names}
1623
- else:
1624
- # Data is still a PySpark DataFrame - use narwhals
1625
- import narwhals as nw
1626
-
1627
- df_nw = nw.from_native(data)
1628
- none_values = {}
1629
- for col in col_names:
1630
- # Get null mask, collect to pandas, then convert to list
1631
- null_mask = (
1632
- df_nw.select(nw.col(col).is_null()).collect().to_pandas().iloc[:, 0].tolist()
1633
- )
1634
- none_values[col] = null_mask
1635
1616
  else:
1617
+ # PySpark data has been converted to Pandas by this point so the 'isnull()'
1618
+ # method can be used
1636
1619
  none_values = {k: data[k].isnull() for k in col_names}
1637
1620
 
1638
1621
  none_values = [(k, i) for k, v in none_values.items() for i, val in enumerate(v) if val]
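
The simplified branch above relies on PySpark data already having been converted to Pandas, so a single `isnull()` path suffices; the final comprehension then flattens the per-column null masks into `(column, row_index)` pairs. A small stdlib-only illustration of that flattening step:

```python
# Dict of per-column null masks -> list of coordinates of the null cells
none_values = {"a": [False, True, False], "b": [True, False, False]}
coords = [(k, i) for k, v in none_values.items() for i, val in enumerate(v) if val]
print(coords)  # [('a', 1), ('b', 0)]
```
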
@@ -1980,59 +1963,68 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
1980
1963
 
1981
1964
  # Use the `row_ranges` list of lists to query, for each column, the proportion of missing
1982
1965
  # values in each 'sector' of the table (a sector is a range of rows)
1983
- if df_lib_name_gt == "polars":
1984
- missing_vals = {
1985
- col: [
1986
- (
1987
- data[(cut_points[i - 1] if i > 0 else 0) : cut_points[i]][col]
1988
- .isnull()
1989
- .sum()
1990
- .to_polars()
1991
- / (cut_points[i] - (cut_points[i - 1] if i > 0 else 0))
1992
- * 100
1993
- if cut_points[i] > (cut_points[i - 1] if i > 0 else 0)
1994
- else 0
1995
- )
1996
- for i in range(len(cut_points))
1997
- ]
1998
- + [
1999
- (
2000
- data[cut_points[-1] : n_rows][col].isnull().sum().to_polars()
2001
- / (n_rows - cut_points[-1])
2002
- * 100
2003
- if n_rows > cut_points[-1]
2004
- else 0
2005
- )
2006
- ]
2007
- for col in data.columns
2008
- }
1966
+ def _calculate_missing_proportions(use_polars_conversion: bool = False):
1967
+ """
1968
+ Calculate missing value proportions for each column and sector.
1969
+
1970
+ Parameters
1971
+ ----------
1972
+ use_polars_conversion
1973
+ If True, use `.to_polars()` for conversions, otherwise use `.to_pandas()`
1974
+ """
1975
+ missing_vals = {}
1976
+ for col in data.columns:
1977
+ col_missing_props = []
1978
+
1979
+ # Calculate missing value proportions for each sector
1980
+ for i in range(len(cut_points)):
1981
+ start_row = cut_points[i - 1] if i > 0 else 0
1982
+ end_row = cut_points[i]
1983
+ sector_size = end_row - start_row
1984
+
1985
+ if sector_size > 0:
1986
+ sector_data = data[start_row:end_row][col]
1987
+ null_sum = sector_data.isnull().sum()
1988
+
1989
+ # Apply the appropriate conversion method
1990
+ if use_polars_conversion:
1991
+ null_sum_converted = null_sum.to_polars()
1992
+ else:
1993
+ null_sum_converted = null_sum.to_pandas()
2009
1994
 
1995
+ missing_prop = (null_sum_converted / sector_size) * 100
1996
+ col_missing_props.append(missing_prop)
1997
+ else:
1998
+ col_missing_props.append(0)
1999
+
2000
+ # Handle the final sector (after last cut point)
2001
+ if n_rows > cut_points[-1]:
2002
+ start_row = cut_points[-1]
2003
+ sector_size = n_rows - start_row
2004
+
2005
+ sector_data = data[start_row:n_rows][col]
2006
+ null_sum = sector_data.isnull().sum()
2007
+
2008
+ # Apply the appropriate conversion method
2009
+ if use_polars_conversion:
2010
+ null_sum_converted = null_sum.to_polars()
2011
+ else:
2012
+ null_sum_converted = null_sum.to_pandas()
2013
+
2014
+ missing_prop = (null_sum_converted / sector_size) * 100
2015
+ col_missing_props.append(missing_prop)
2016
+ else:
2017
+ col_missing_props.append(0) # pragma: no cover
2018
+
2019
+ missing_vals[col] = col_missing_props
2020
+
2021
+ return missing_vals
2022
+
2023
+ # Use the helper function based on the DataFrame library
2024
+ if df_lib_name_gt == "polars":
2025
+ missing_vals = _calculate_missing_proportions(use_polars_conversion=True)
2010
2026
  else:
2011
- missing_vals = {
2012
- col: [
2013
- (
2014
- data[(cut_points[i - 1] if i > 0 else 0) : cut_points[i]][col]
2015
- .isnull()
2016
- .sum()
2017
- .to_pandas()
2018
- / (cut_points[i] - (cut_points[i - 1] if i > 0 else 0))
2019
- * 100
2020
- if cut_points[i] > (cut_points[i - 1] if i > 0 else 0)
2021
- else 0
2022
- )
2023
- for i in range(len(cut_points))
2024
- ]
2025
- + [
2026
- (
2027
- data[cut_points[-1] : n_rows][col].isnull().sum().to_pandas()
2028
- / (n_rows - cut_points[-1])
2029
- * 100
2030
- if n_rows > cut_points[-1]
2031
- else 0
2032
- )
2033
- ]
2034
- for col in data.columns
2035
- }
2027
+ missing_vals = _calculate_missing_proportions(use_polars_conversion=False)
2036
2028
 
2037
2029
  # Pivot the `missing_vals` dictionary to create a table with the missing value proportions
2038
2030
  missing_vals = {
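
The refactor above replaces two near-duplicate dictionary comprehensions with a `_calculate_missing_proportions()` helper that walks row "sectors" and reports the percentage of missing values in each. Below is a hypothetical standalone version of that sector calculation for a single Pandas Series; `sector_missing_pct` is an illustrative name, not part of pointblank.

```python
import pandas as pd


def sector_missing_pct(series: pd.Series, cut_points: list) -> list:
    # Split the rows at the cut points and compute percent-missing per sector
    bounds = [0, *cut_points, len(series)]
    out = []
    for start, end in zip(bounds[:-1], bounds[1:]):
        size = end - start
        pct = float(series.iloc[start:end].isnull().sum()) / size * 100 if size else 0.0
        out.append(pct)
    return out


s = pd.Series([1, None, None, None, 5, 6])
print(sector_missing_pct(s, cut_points=[2, 4]))  # [50.0, 100.0, 0.0]
```
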
@@ -2053,16 +2045,17 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
2053
2045
  # Get the column names from the table
2054
2046
  col_names = list(data.columns)
2055
2047
 
2056
- # Iterate over the cut points and get the proportion of missing values in each 'sector'
2057
- # for each column
2058
- if "polars" in tbl_type:
2059
- # Polars case
2048
+ # Helper function for DataFrame missing value calculation (Polars/Pandas)
2049
+ def _calculate_missing_proportions_dataframe(is_polars=False):
2050
+ null_method = "is_null" if is_polars else "isnull"
2051
+
2060
2052
  missing_vals = {
2061
2053
  col: [
2062
2054
  (
2063
- data[(cut_points[i - 1] if i > 0 else 0) : cut_points[i]][col]
2064
- .is_null()
2065
- .sum()
2055
+ getattr(
2056
+ data[(cut_points[i - 1] if i > 0 else 0) : cut_points[i]][col],
2057
+ null_method,
2058
+ )().sum()
2066
2059
  / (cut_points[i] - (cut_points[i - 1] if i > 0 else 0))
2067
2060
  * 100
2068
2061
  if cut_points[i] > (cut_points[i - 1] if i > 0 else 0)
@@ -2072,7 +2065,7 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
2072
2065
  ]
2073
2066
  + [
2074
2067
  (
2075
- data[cut_points[-1] : n_rows][col].is_null().sum()
2068
+ getattr(data[cut_points[-1] : n_rows][col], null_method)().sum()
2076
2069
  / (n_rows - cut_points[-1])
2077
2070
  * 100
2078
2071
  if n_rows > cut_points[-1]
@@ -2082,7 +2075,8 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
2082
2075
  for col in data.columns
2083
2076
  }
2084
2077
 
2085
- missing_vals = {
2078
+ # Transform to the expected format
2079
+ formatted_missing_vals = {
2086
2080
  "columns": list(missing_vals.keys()),
2087
2081
  **{
2088
2082
  str(i + 1): [missing_vals[col][i] for col in missing_vals.keys()]
@@ -2091,48 +2085,25 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
2091
2085
  }
2092
2086
 
2093
2087
  # Get a dictionary of counts of missing values in each column
2094
- missing_val_counts = {col: data[col].is_null().sum() for col in data.columns}
2095
-
2096
- if "pandas" in tbl_type:
2097
- missing_vals = {
2098
- col: [
2099
- (
2100
- data[(cut_points[i - 1] if i > 0 else 0) : cut_points[i]][col]
2101
- .isnull()
2102
- .sum()
2103
- / (cut_points[i] - (cut_points[i - 1] if i > 0 else 0))
2104
- * 100
2105
- if cut_points[i] > (cut_points[i - 1] if i > 0 else 0)
2106
- else 0
2107
- )
2108
- for i in range(len(cut_points))
2109
- ]
2110
- + [
2111
- (
2112
- data[cut_points[-1] : n_rows][col].isnull().sum()
2113
- / (n_rows - cut_points[-1])
2114
- * 100
2115
- if n_rows > cut_points[-1]
2116
- else 0
2117
- )
2118
- ]
2119
- for col in data.columns
2088
+ missing_val_counts = {
2089
+ col: getattr(data[col], null_method)().sum() for col in data.columns
2120
2090
  }
2121
2091
 
2122
- # Pivot the `missing_vals` dictionary to create a table with the missing
2123
- # value proportions
2124
- missing_vals = {
2125
- "columns": list(missing_vals.keys()),
2126
- **{
2127
- str(i + 1): [missing_vals[col][i] for col in missing_vals.keys()]
2128
- for i in range(len(cut_points) + 1)
2129
- },
2130
- }
2092
+ return formatted_missing_vals, missing_val_counts
2131
2093
 
2132
- # Get a dictionary of counts of missing values in each column
2133
- missing_val_counts = {col: data[col].isnull().sum() for col in data.columns}
2094
+ # Iterate over the cut points and get the proportion of missing values in each 'sector'
2095
+ # for each column
2096
+ if "polars" in tbl_type:
2097
+ missing_vals, missing_val_counts = _calculate_missing_proportions_dataframe(
2098
+ is_polars=True
2099
+ )
2100
+
2101
+ elif "pandas" in tbl_type:
2102
+ missing_vals, missing_val_counts = _calculate_missing_proportions_dataframe(
2103
+ is_polars=False
2104
+ )
2134
2105
 
2135
- if "pyspark" in tbl_type:
2106
+ elif "pyspark" in tbl_type:
2136
2107
  from pyspark.sql.functions import col as pyspark_col
2137
2108
 
2138
2109
  # PySpark implementation for missing values calculation
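
The consolidated Polars/Pandas branch above picks the right null-check method by name (`is_null` vs. `isnull`) and calls it through `getattr()`. A small, self-contained illustration of that dispatch; `null_counts` is an illustrative helper and assumes nothing else about `missing_vals_tbl()`.

```python
import pandas as pd
import polars as pl


def null_counts(frame, is_polars: bool) -> dict:
    # Polars uses `is_null()`, Pandas uses `isnull()`; dispatch by method name
    method = "is_null" if is_polars else "isnull"
    return {col: int(getattr(frame[col], method)().sum()) for col in frame.columns}


print(null_counts(pd.DataFrame({"x": [1, None]}), is_polars=False))  # {'x': 1}
print(null_counts(pl.DataFrame({"x": [1, None]}), is_polars=True))   # {'x': 1}
```
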
@@ -2164,7 +2135,7 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
2164
2135
  missing_prop = (null_count / sector_size) * 100
2165
2136
  col_missing_props.append(missing_prop)
2166
2137
  else:
2167
- col_missing_props.append(0)
2138
+ col_missing_props.append(0) # pragma: no cover
2168
2139
 
2169
2140
  # Handle the final sector (after last cut point)
2170
2141
  if n_rows > cut_points[-1]:
@@ -2184,7 +2155,7 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
2184
2155
  missing_prop = (null_count / sector_size) * 100
2185
2156
  col_missing_props.append(missing_prop)
2186
2157
  else:
2187
- col_missing_props.append(0)
2158
+ col_missing_props.append(0) # pragma: no cover
2188
2159
 
2189
2160
  missing_vals[col_name] = col_missing_props
2190
2161
 
@@ -2623,7 +2594,7 @@ def get_column_count(data: FrameT | Any) -> int:
2623
2594
  except Exception:
2624
2595
  # Fallback for unsupported types
2625
2596
  if "pandas" in str(type(data)):
2626
- return data.shape[1]
2597
+ return data.shape[1] # pragma: no cover
2627
2598
  else:
2628
2599
  raise ValueError("The input table type supplied in `data=` is not supported.")
2629
2600
 
@@ -2793,14 +2764,14 @@ def get_row_count(data: FrameT | Any) -> int:
2793
2764
  if hasattr(df_nw, "shape"):
2794
2765
  return df_nw.shape[0]
2795
2766
  elif hasattr(df_nw, "height"):
2796
- return df_nw.height
2797
- else:
2767
+ return df_nw.height # pragma: no cover
2768
+ else: # pragma: no cover
2798
2769
  raise ValueError("Unable to determine row count from Narwhals DataFrame")
2799
2770
  except Exception:
2800
2771
  # Fallback for types that don't work with Narwhals
2801
- if "pandas" in str(type(data)):
2772
+ if "pandas" in str(type(data)): # pragma: no cover
2802
2773
  return data.shape[0]
2803
- elif "pyspark" in str(type(data)):
2774
+ elif "pyspark" in str(type(data)): # pragma: no cover
2804
2775
  return data.count()
2805
2776
  else:
2806
2777
  raise ValueError("The input table type supplied in `data=` is not supported.")
@@ -3019,7 +2990,7 @@ def connect_to_table(connection_string: str) -> Any:
3019
2990
  # Get list of available tables
3020
2991
  try:
3021
2992
  available_tables = conn.list_tables()
3022
- except Exception:
2993
+ except Exception: # pragma: no cover
3023
2994
  available_tables = []
3024
2995
 
3025
2996
  conn.disconnect()
@@ -3064,7 +3035,7 @@ def connect_to_table(connection_string: str) -> Any:
3064
3035
  }
3065
3036
 
3066
3037
  # Check if this is a missing backend dependency
3067
- for backend, install_cmd in backend_install_map.items():
3038
+ for backend, install_cmd in backend_install_map.items(): # pragma: no cover
3068
3039
  if backend in error_str and ("not found" in error_str or "no module" in error_str):
3069
3040
  raise ConnectionError(
3070
3041
  f"Missing {backend.upper()} backend for Ibis. Install it with:\n"
@@ -3081,7 +3052,7 @@ def connect_to_table(connection_string: str) -> Any:
3081
3052
  ) from e
3082
3053
 
3083
3054
  # Generic connection error
3084
- raise ConnectionError(
3055
+ raise ConnectionError( # pragma: no cover
3085
3056
  f"Failed to connect to database using connection string: {connection_string}\n"
3086
3057
  f"Error: {e}\n\n"
3087
3058
  f"No table specified. Use the format: {connection_string}::TABLE_NAME"
@@ -3090,7 +3061,7 @@ def connect_to_table(connection_string: str) -> Any:
3090
3061
  # Split connection string and table name
3091
3062
  try:
3092
3063
  base_connection, table_name = connection_string.rsplit("::", 1)
3093
- except ValueError:
3064
+ except ValueError: # pragma: no cover
3094
3065
  raise ValueError(f"Invalid connection string format: {connection_string}")
3095
3066
 
3096
3067
  # Connect to database and get table
@@ -3124,7 +3095,7 @@ def connect_to_table(connection_string: str) -> Any:
3124
3095
  # Check if table doesn't exist
3125
3096
  if "table" in error_str and ("not found" in error_str or "does not exist" in error_str):
3126
3097
  # Try to get available tables for helpful message
3127
- try:
3098
+ try: # pragma: no cover
3128
3099
  available_tables = conn.list_tables()
3129
3100
  if available_tables:
3130
3101
  table_list = "\n".join(f" - {table}" for table in available_tables)
@@ -3758,6 +3729,141 @@ class Validate:
3758
3729
 
3759
3730
  self.validation_info = []
3760
3731
 
3732
+ def set_tbl(
3733
+ self,
3734
+ tbl: FrameT | Any,
3735
+ tbl_name: str | None = None,
3736
+ label: str | None = None,
3737
+ ) -> Validate:
3738
+ """
3739
+ Set or replace the table associated with the Validate object.
3740
+
3741
+ This method allows you to replace the table associated with a Validate object with a
3742
+ different (but presumably similar) table. This is useful when you want to apply the same
3743
+ validation plan to multiple tables or when you have a validation workflow defined but want
3744
+ to swap in a different data source.
3745
+
3746
+ Parameters
3747
+ ----------
3748
+ tbl
3749
+ The table to replace the existing table with. This can be any supported table type
3750
+ including DataFrame objects, Ibis table objects, CSV file paths, Parquet file paths,
3751
+ GitHub URLs, or database connection strings. The same table type constraints apply as in
3752
+ the `Validate` constructor.
3753
+ tbl_name
3754
+ An optional name to assign to the new input table object. If no value is provided, the
3755
+ existing table name will be retained.
3756
+ label
3757
+ An optional label for the validation plan. If no value is provided, the existing label
3758
+ will be retained.
3759
+
3760
+ Returns
3761
+ -------
3762
+ Validate
3763
+ A new `Validate` object with the replacement table.
3764
+
3765
+ When to Use
3766
+ -----------
3767
+ The `set_tbl()` method is particularly useful in scenarios where you have:
3768
+
3769
+ - multiple similar tables that need the same validation checks
3770
+ - a template validation workflow that should be applied to different data sources
3771
+ - YAML-defined validations where you want to override the table specified in the YAML
3772
+
3773
+ The `set_tbl()` method creates a copy of the validation object with the new table, so the
3774
+ original validation object remains unchanged. This allows you to reuse validation plans
3775
+ across multiple tables without interference.
3776
+
3777
+ Examples
3778
+ --------
3779
+ ```{python}
3780
+ #| echo: false
3781
+ #| output: false
3782
+ import pointblank as pb
3783
+ pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
3784
+ ```
3785
+ We will first create two similar tables for our future validation plans.
3786
+
3787
+ ```{python}
3788
+ import pointblank as pb
3789
+ import polars as pl
3790
+
3791
+ # Create two similar tables
3792
+ table_1 = pl.DataFrame({
3793
+ "x": [1, 2, 3, 4, 5],
3794
+ "y": [5, 4, 3, 2, 1],
3795
+ "z": ["a", "b", "c", "d", "e"]
3796
+ })
3797
+
3798
+ table_2 = pl.DataFrame({
3799
+ "x": [2, 4, 6, 8, 10],
3800
+ "y": [10, 8, 6, 4, 2],
3801
+ "z": ["f", "g", "h", "i", "j"]
3802
+ })
3803
+ ```
3804
+
3805
+ Create a validation plan with the first table.
3806
+
3807
+ ```{python}
3808
+ validation_table_1 = (
3809
+ pb.Validate(
3810
+ data=table_1,
3811
+ tbl_name="Table 1",
3812
+ label="Validation applied to the first table"
3813
+ )
3814
+ .col_vals_gt(columns="x", value=0)
3815
+ .col_vals_lt(columns="y", value=10)
3816
+ )
3817
+ ```
3818
+
3819
+ Now apply the same validation plan to the second table.
3820
+
3821
+ ```{python}
3822
+ validation_table_2 = (
3823
+ validation_table_1
3824
+ .set_tbl(
3825
+ tbl=table_2,
3826
+ tbl_name="Table 2",
3827
+ label="Validation applied to the second table"
3828
+ )
3829
+ )
3830
+ ```
3831
+
3832
+ Here is the interrogation of the first table:
3833
+
3834
+ ```{python}
3835
+ validation_table_1.interrogate()
3836
+ ```
3837
+
3838
+ And the second table:
3839
+
3840
+ ```{python}
3841
+ validation_table_2.interrogate()
3842
+ ```
3843
+ """
3844
+ from copy import deepcopy
3845
+
3846
+ # Create a deep copy of the current Validate object
3847
+ new_validate = deepcopy(self)
3848
+
3849
+ # Process the new table through the centralized data processing pipeline
3850
+ new_validate.data = _process_data(tbl)
3851
+
3852
+ # Update table name if provided, otherwise keep existing
3853
+ if tbl_name is not None:
3854
+ new_validate.tbl_name = tbl_name
3855
+
3856
+ # Update label if provided, otherwise keep existing
3857
+ if label is not None:
3858
+ new_validate.label = label
3859
+
3860
+ # Reset interrogation state since we have a new table, but preserve validation steps
3861
+ new_validate.time_start = None
3862
+ new_validate.time_end = None
3863
+ # Note: We keep validation_info as it contains the defined validation steps
3864
+
3865
+ return new_validate
3866
+
3761
3867
  def _repr_html_(self) -> str:
3762
3868
  return self.get_tabular_report()._repr_html_() # pragma: no cover
3763
3869
 
pointblank/yaml.py CHANGED
@@ -4,6 +4,7 @@ from pathlib import Path
4
4
  from typing import Any, Union
5
5
 
6
6
  import yaml
7
+ from narwhals.typing import FrameT
7
8
 
8
9
  from pointblank._utils import _is_lib_present
9
10
  from pointblank.thresholds import Actions
@@ -749,7 +750,7 @@ class YAMLValidator:
749
750
  return validation
750
751
 
751
752
 
752
- def yaml_interrogate(yaml: Union[str, Path]) -> Validate:
753
+ def yaml_interrogate(yaml: Union[str, Path], set_tbl: Union[FrameT, Any, None] = None) -> Validate:
753
754
  """Execute a YAML-based validation workflow.
754
755
 
755
756
  This is the main entry point for YAML-based validation workflows. It takes YAML configuration
@@ -764,13 +765,20 @@ def yaml_interrogate(yaml: Union[str, Path]) -> Validate:
764
765
  yaml
765
766
  YAML configuration as string or file path. Can be: (1) a YAML string containing the
766
767
  validation configuration, or (2) a Path object or string path to a YAML file.
768
+ set_tbl
769
+ An optional table to override the table specified in the YAML configuration. This allows you
770
+ to apply a YAML-defined validation workflow to a different table than what's specified in
771
+ the configuration. If provided, this table will replace the table defined in the YAML's
772
+ `tbl` field before executing the validation workflow. This can be any supported table type
773
+ including DataFrame objects, Ibis table objects, CSV file paths, Parquet file paths, GitHub
774
+ URLs, or database connection strings.
767
775
 
768
776
  Returns
769
777
  -------
770
778
  Validate
771
- An instance of the `Validate` class that has been configured based on the YAML input.
772
- This object contains the results of the validation steps defined in the YAML configuration.
773
- It includes metadata like table name, label, language, and thresholds if specified.
779
+ An instance of the `Validate` class that has been configured based on the YAML input. This
780
+ object contains the results of the validation steps defined in the YAML configuration. It
781
+ includes metadata like table name, label, language, and thresholds if specified.
774
782
 
775
783
  Raises
776
784
  ------
@@ -875,10 +883,59 @@ def yaml_interrogate(yaml: Union[str, Path]) -> Validate:
875
883
  This approach is particularly useful for storing validation configurations as part of your data
876
884
  pipeline or version control system, allowing you to maintain validation rules alongside your
877
885
  code.
886
+
887
+ ### Using `set_tbl=` to Override the Table
888
+
889
+ The `set_tbl=` parameter allows you to override the table specified in the YAML configuration.
890
+ This is useful when you have a template validation workflow but want to apply it to different
891
+ tables:
892
+
893
+ ```{python}
894
+ import polars as pl
895
+
896
+ # Create a test table with similar structure to small_table
897
+ test_table = pl.DataFrame({
898
+ "date": ["2023-01-01", "2023-01-02", "2023-01-03"],
899
+ "a": [1, 2, 3],
900
+ "b": ["1-abc-123", "2-def-456", "3-ghi-789"],
901
+ "d": [150, 200, 250]
902
+ })
903
+
904
+ # Use the same YAML config but apply it to our test table
905
+ yaml_config = '''
906
+ tbl: small_table # This will be overridden
907
+ tbl_name: Test Table # This name will be used
908
+ steps:
909
+ - col_exists:
910
+ columns: [date, a, b, d]
911
+ - col_vals_gt:
912
+ columns: [d]
913
+ value: 100
914
+ '''
915
+
916
+ # Execute with table override
917
+ result = pb.yaml_interrogate(yaml_config, set_tbl=test_table)
918
+ print(f"Validation applied to: {result.tbl_name}")
919
+ result
920
+ ```
921
+
922
+ This feature makes YAML configurations more reusable and flexible, allowing you to define
923
+ validation logic once and apply it to multiple similar tables.
878
924
  """
879
925
  validator = YAMLValidator()
880
926
  config = validator.load_config(yaml)
881
- return validator.execute_workflow(config)
927
+
928
+ # If `set_tbl=` is provided, we need to build the validation workflow and then use `set_tbl()`
929
+ if set_tbl is not None:
930
+ # First build the validation object without interrogation
931
+ validation = validator.build_validation(config)
932
+ # Then replace the table using set_tbl method
933
+ validation = validation.set_tbl(tbl=set_tbl)
934
+ # Finally interrogate with the new table
935
+ return validation.interrogate()
936
+ else:
937
+ # Standard execution without table override (includes interrogation)
938
+ return validator.execute_workflow(config)
882
939
 
883
940
 
884
941
  def load_yaml_config(file_path: Union[str, Path]) -> dict:
@@ -1453,26 +1510,6 @@ def yaml_to_python(yaml: Union[str, Path]) -> str:
1453
1510
  action_params.append(f"highest_only={value.highest_only}")
1454
1511
  actions_str = "pb.Actions(" + ", ".join(action_params) + ")"
1455
1512
  param_parts.append(f"actions={actions_str}")
1456
- elif isinstance(value, dict):
1457
- action_params = []
1458
- step_action_base = f"steps[{step_index}].{list(step_config.keys())[0]}.actions"
1459
- for action_key, action_value in value.items():
1460
- if action_key == "highest_only":
1461
- action_params.append(f"{action_key}={action_value}")
1462
- else:
1463
- # Check if we have an original expression for this action
1464
- action_expr_path = f"{step_action_base}.{action_key}"
1465
- if action_expr_path in step_expressions:
1466
- action_params.append(
1467
- f"{action_key}={step_expressions[action_expr_path]}"
1468
- )
1469
- elif isinstance(action_value, str):
1470
- action_params.append(f'{action_key}="{action_value}"')
1471
- else:
1472
- # For callables or complex expressions
1473
- action_params.append(f"{action_key}={action_value}")
1474
- actions_str = "pb.Actions(" + ", ".join(action_params) + ")"
1475
- param_parts.append(f"actions={actions_str}")
1476
1513
  else:
1477
1514
  param_parts.append(f"actions={value}")
1478
1515
  elif key == "thresholds":
pointblank-0.12.1.dist-info/METADATA → pointblank-0.13.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pointblank
3
- Version: 0.12.1
3
+ Version: 0.13.0
4
4
  Summary: Find out if your data is what you think it is.
5
5
  Author-email: Richard Iannone <riannone@me.com>
6
6
  License: MIT License
@@ -60,6 +60,12 @@ Requires-Dist: chatlas>=0.3.0; extra == "generate"
60
60
  Requires-Dist: anthropic[bedrock]>=0.45.2; extra == "generate"
61
61
  Requires-Dist: openai>=1.63.0; extra == "generate"
62
62
  Requires-Dist: shiny>=1.3.0; extra == "generate"
63
+ Provides-Extra: mcp
64
+ Requires-Dist: mcp[cli]>=1.10.1; extra == "mcp"
65
+ Requires-Dist: fastmcp>=2.11.3; extra == "mcp"
66
+ Requires-Dist: pytest-asyncio>=1.0.0; extra == "mcp"
67
+ Provides-Extra: excel
68
+ Requires-Dist: openpyxl>=3.0.0; extra == "excel"
63
69
  Provides-Extra: bigquery
64
70
  Requires-Dist: ibis-framework[bigquery]>=9.5.0; extra == "bigquery"
65
71
  Provides-Extra: databricks
@@ -84,6 +90,7 @@ Requires-Dist: quartodoc>=0.8.1; python_version >= "3.9" and extra == "docs"
84
90
  Requires-Dist: pandas>=2.2.3; extra == "docs"
85
91
  Requires-Dist: polars>=1.17.1; extra == "docs"
86
92
  Requires-Dist: pyspark==3.5.6; extra == "docs"
93
+ Requires-Dist: openpyxl>=3.0.0; extra == "docs"
87
94
  Dynamic: license-file
88
95
 
89
96
  <div align="center">
pointblank-0.12.1.dist-info/RECORD → pointblank-0.13.0.dist-info/RECORD CHANGED
@@ -3,7 +3,7 @@ pointblank/_constants.py,sha256=rB8qTnhabwmSQURevHqokC1pp5lfaWMCzhmbMZ0CP8A,8151
3
3
  pointblank/_constants_docs.py,sha256=JBmtt16zTYQ-zaM4ElLExtKs-dKlnN553Ys2ML1Y1C8,2099
4
4
  pointblank/_constants_translations.py,sha256=HXcCYmKoMjoaFv-Ym4UWv3AsIVXik2zDyAy7xvTvv0Y,186710
5
5
  pointblank/_datascan_utils.py,sha256=EMfeabXm_ZsCUKPROB7rFhyOpjtRs8jcnZ_9nBtMyws,1750
6
- pointblank/_interrogation.py,sha256=a0O30kY6GQmeqkAPZqBynFJHsmwFXr6pimpNL2uUPaU,76996
6
+ pointblank/_interrogation.py,sha256=p3qPTgcsYiDEyV9d5pWLzAqz9rU9-IsfmSFV4sWRBNI,76932
7
7
  pointblank/_typing.py,sha256=aItbCbzhbzqjK3lCbL27ltRyXoAH1c3-U6xQdRzg-lU,1594
8
8
  pointblank/_utils.py,sha256=ikgkFomoAEOxaiItHZUo3NTHu0MJHWfKAF_fnX9rRnA,30685
9
9
  pointblank/_utils_check_args.py,sha256=rFEc1nbCN8ftsQQWVjCNWmQ2QmUDxkfgmoJclrZeTLs,5489
@@ -21,9 +21,9 @@ pointblank/schema.py,sha256=vwGF8UKy2riRSQzcwatcI6L0t_6ccdbOayrKonvyodE,45777
21
21
  pointblank/segments.py,sha256=RXp3lPr3FboVseadNqLgIeoMBh_mykrQSFp1WtV41Yg,5570
22
22
  pointblank/tf.py,sha256=8o_8m4i01teulEe3-YYMotSNf3tImjBMInsvdjSAO5Q,8844
23
23
  pointblank/thresholds.py,sha256=mybeLzTVdmN04NLKoV-jiSBXsWknwHO0Gox0ttVN_MU,25766
24
- pointblank/validate.py,sha256=KvnC0UnvVW2mkoWkp1fDIXotuBl7MJeU6_ggp_0yDoo,693082
25
- pointblank/yaml.py,sha256=4DrkOJwCQ3CaXQ7ESNIW72pp-dL1ctlX6ONU30Vh1Fs,57901
26
- pointblank/data/api-docs.txt,sha256=0wXk__xYwgKeS24ZjbaTPFeJ3ZO7AIyMQoFClCcvPTc,529897
24
+ pointblank/validate.py,sha256=py6w239Mh7tbAfXJkanDLARCkWE5EFhTlfvS0KOjnWA,697215
25
+ pointblank/yaml.py,sha256=Sy802CZBOgEZGwbIes8wcXPPt2a5rXO0b3lh9tsLS8w,58966
26
+ pointblank/data/api-docs.txt,sha256=w2nIkIL_fJpXlPR9clogqcgdiv-uHvdSDI8gjkP_mCQ,531711
27
27
  pointblank/data/game_revenue-duckdb.zip,sha256=tKIVx48OGLYGsQPS3h5AjA2Nyq_rfEpLCjBiFUWhagU,35880
28
28
  pointblank/data/game_revenue.zip,sha256=7c9EvHLyi93CHUd4p3dM4CZ-GucFCtXKSPxgLojL32U,33749
29
29
  pointblank/data/global_sales-duckdb.zip,sha256=2ok_cvJ1ZuSkXnw0R6_OkKYRTWhJ-jJEMq2VYsv5fqY,1336390
@@ -33,9 +33,9 @@ pointblank/data/nycflights.zip,sha256=yVjbUaKUz2LydSdF9cABuir0VReHBBgV7shiNWSd0m
33
33
  pointblank/data/polars-api-docs.txt,sha256=KGcS-BOtUs9zgpkWfXD-GFdFh4O_zjdkpX7msHjztLg,198045
34
34
  pointblank/data/small_table-duckdb.zip,sha256=BhTaZ2CRS4-9Z1uVhOU6HggvW3XCar7etMznfENIcOc,2028
35
35
  pointblank/data/small_table.zip,sha256=lmFb90Nb-v5X559Ikjg31YLAXuRyMkD9yLRElkXPMzQ,472
36
- pointblank-0.12.1.dist-info/licenses/LICENSE,sha256=apLF-HWPNU7pT5bmf5KmZpD5Cklpy2u-BN_0xBoRMLY,1081
37
- pointblank-0.12.1.dist-info/METADATA,sha256=1fJY92u1AiJdYggJLaUf0TKbovh3ytcihIdh4PcBEQ8,19242
38
- pointblank-0.12.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
39
- pointblank-0.12.1.dist-info/entry_points.txt,sha256=GqqqOTOH8uZe22wLcvYjzpizqk_j4MNcUo2YM14ryCw,42
40
- pointblank-0.12.1.dist-info/top_level.txt,sha256=-wHrS1SvV8-nhvc3w-PPYs1C1WtEc1pK-eGjubbCCKc,11
41
- pointblank-0.12.1.dist-info/RECORD,,
36
+ pointblank-0.13.0.dist-info/licenses/LICENSE,sha256=apLF-HWPNU7pT5bmf5KmZpD5Cklpy2u-BN_0xBoRMLY,1081
37
+ pointblank-0.13.0.dist-info/METADATA,sha256=A-tNLSbVOz6M27ZVq_ihOQiOdTtEMs3ub8T27kK_DSY,19529
38
+ pointblank-0.13.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
39
+ pointblank-0.13.0.dist-info/entry_points.txt,sha256=GqqqOTOH8uZe22wLcvYjzpizqk_j4MNcUo2YM14ryCw,42
40
+ pointblank-0.13.0.dist-info/top_level.txt,sha256=-wHrS1SvV8-nhvc3w-PPYs1C1WtEc1pK-eGjubbCCKc,11
41
+ pointblank-0.13.0.dist-info/RECORD,,