pointblank 0.11.5__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/__init__.py CHANGED
@@ -25,6 +25,7 @@ from pointblank.column import (
 from pointblank.datascan import DataScan, col_summary_tbl
 from pointblank.draft import DraftValidation
 from pointblank.schema import Schema
+from pointblank.segments import seg_group
 from pointblank.tf import TF
 from pointblank.thresholds import Actions, FinalActions, Thresholds
 from pointblank.validate import (
@@ -76,6 +77,7 @@ __all__ = [
     "get_validation_summary",
     "get_column_count",
     "get_row_count",
+    "seg_group",
     "send_slack_notification",
     # YAML functionality
     "yaml_interrogate",
pointblank/_constants.py CHANGED
@@ -118,7 +118,6 @@ IBIS_BACKENDS = [
     "mysql",
     "parquet",
     "postgres",
-    "pyspark",
     "snowflake",
     "sqlite",
 ]
pointblank/_interrogation.py CHANGED
@@ -23,6 +23,74 @@ if TYPE_CHECKING:
     from pointblank._typing import AbsoluteTolBounds


+def _safe_modify_datetime_compare_val(data_frame: Any, column: str, compare_val: Any) -> Any:
+    """
+    Safely modify datetime comparison values for LazyFrame compatibility.
+
+    This function handles the case where we can't directly slice LazyFrames
+    to get column dtypes for datetime conversion.
+    """
+    try:
+        # First try to get column dtype from schema for LazyFrames
+        column_dtype = None
+
+        if hasattr(data_frame, "collect_schema"):
+            schema = data_frame.collect_schema()
+            column_dtype = schema.get(column)
+        elif hasattr(data_frame, "schema"):
+            schema = data_frame.schema
+            column_dtype = schema.get(column)
+
+        # If we got a dtype from schema, use it
+        if column_dtype is not None:
+            # Create a mock column object for _modify_datetime_compare_val
+            class MockColumn:
+                def __init__(self, dtype):
+                    self.dtype = dtype
+
+            mock_column = MockColumn(column_dtype)
+            return _modify_datetime_compare_val(tgt_column=mock_column, compare_val=compare_val)
+
+        # Fallback: try collecting a small sample if possible
+        try:
+            sample = data_frame.head(1).collect()
+            if hasattr(sample, "dtypes") and column in sample.columns:
+                # For pandas-like dtypes
+                column_dtype = sample.dtypes[column] if hasattr(sample, "dtypes") else None
+                if column_dtype:
+
+                    class MockColumn:
+                        def __init__(self, dtype):
+                            self.dtype = dtype
+
+                    mock_column = MockColumn(column_dtype)
+                    return _modify_datetime_compare_val(
+                        tgt_column=mock_column, compare_val=compare_val
+                    )
+        except Exception:
+            pass
+
+        # Final fallback: try direct access (for eager DataFrames)
+        try:
+            if hasattr(data_frame, "dtypes") and column in data_frame.columns:
+                column_dtype = data_frame.dtypes[column]
+
+                class MockColumn:
+                    def __init__(self, dtype):
+                        self.dtype = dtype
+
+                mock_column = MockColumn(column_dtype)
+                return _modify_datetime_compare_val(tgt_column=mock_column, compare_val=compare_val)
+        except Exception:
+            pass
+
+    except Exception:
+        pass
+
+    # If all else fails, return the original compare_val
+    return compare_val
+
+
 @dataclass
 class Interrogator:
     """
@@ -136,9 +204,7 @@ class Interrogator:

         compare_expr = _get_compare_expr_nw(compare=self.compare)

-        compare_expr = _modify_datetime_compare_val(
-            tgt_column=self.x[self.column], compare_val=compare_expr
-        )
+        compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)

         return (
             self.x.with_columns(
@@ -211,9 +277,7 @@ class Interrogator:

         compare_expr = _get_compare_expr_nw(compare=self.compare)

-        compare_expr = _modify_datetime_compare_val(
-            tgt_column=self.x[self.column], compare_val=compare_expr
-        )
+        compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)

         return (
             self.x.with_columns(
@@ -329,9 +393,7 @@ class Interrogator:
         else:
             compare_expr = _get_compare_expr_nw(compare=self.compare)

-            compare_expr = _modify_datetime_compare_val(
-                tgt_column=self.x[self.column], compare_val=compare_expr
-            )
+            compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)

         tbl = self.x.with_columns(
             pb_is_good_1=nw.col(self.column).is_null() & self.na_pass,
@@ -421,9 +483,7 @@ class Interrogator:
             ).to_native()

         else:
-            compare_expr = _modify_datetime_compare_val(
-                tgt_column=self.x[self.column], compare_val=self.compare
-            )
+            compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, self.compare)

             return self.x.with_columns(
                 pb_is_good_=nw.col(self.column) != nw.lit(compare_expr),
@@ -544,9 +604,7 @@ class Interrogator:
         if ref_col_has_null_vals:
             # Create individual cases for Pandas and Polars

-            compare_expr = _modify_datetime_compare_val(
-                tgt_column=self.x[self.column], compare_val=self.compare
-            )
+            compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, self.compare)

             if is_pandas_dataframe(self.x.to_native()):
                 tbl = self.x.with_columns(
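
Every call site now passes the frame and the column name rather than a sliced column (`self.x[self.column]`), so the helper can resolve the dtype from the schema. A minimal sketch of the distinction this works around, with narwhals and Polars:

    from datetime import date

    import narwhals as nw
    import polars as pl

    lf = nw.from_native(pl.LazyFrame({"d": [date(2021, 1, 1)]}))

    # Slicing a LazyFrame (lf["d"]) is not supported, but the column dtype
    # is available from the schema without materializing any rows:
    print(lf.collect_schema().get("d"))  # Date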
@@ -584,6 +642,25 @@ class Interrogator:

                 return tbl

+            else:
+                # Generic case for other DataFrame types (PySpark, etc.)
+                # Use similar logic to Polars but handle potential differences
+                tbl = self.x.with_columns(
+                    pb_is_good_1=nw.col(self.column).is_null(),  # val is Null in Column
+                    pb_is_good_2=nw.lit(self.na_pass),  # Pass if any Null in val or compare
+                )
+
+                tbl = tbl.with_columns(pb_is_good_3=nw.col(self.column) != nw.lit(compare_expr))
+
+                tbl = tbl.with_columns(
+                    pb_is_good_=(
+                        (nw.col("pb_is_good_1") & nw.col("pb_is_good_2"))
+                        | (nw.col("pb_is_good_3") & ~nw.col("pb_is_good_1"))
+                    )
+                )
+
+                return tbl.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3").to_native()
+
     def ge(self) -> FrameT | Any:
         # Ibis backends ---------------------------------------------

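The three helper columns in the new generic branch encode the `na_pass` semantics: a null test value passes only when `na_pass=True`, and a non-null value passes only when it differs from the comparison value. The same logic on a small eager frame, as a sketch:

    import narwhals as nw
    import polars as pl

    df = nw.from_native(pl.DataFrame({"x": [1, 2, None]}))
    na_pass = True

    out = (
        df.with_columns(
            pb_is_good_1=nw.col("x").is_null(),
            pb_is_good_2=nw.lit(na_pass),
        )
        .with_columns(pb_is_good_3=nw.col("x") != nw.lit(2))
        .with_columns(
            pb_is_good_=(
                (nw.col("pb_is_good_1") & nw.col("pb_is_good_2"))
                | (nw.col("pb_is_good_3") & ~nw.col("pb_is_good_1"))
            )
        )
    )
    # x=1 passes (1 != 2), x=2 fails, and the null row passes since na_pass=True
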
@@ -629,9 +706,7 @@ class Interrogator:

         compare_expr = _get_compare_expr_nw(compare=self.compare)

-        compare_expr = _modify_datetime_compare_val(
-            tgt_column=self.x[self.column], compare_val=compare_expr
-        )
+        compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)

         tbl = (
             self.x.with_columns(
@@ -702,9 +777,7 @@ class Interrogator:

         compare_expr = _get_compare_expr_nw(compare=self.compare)

-        compare_expr = _modify_datetime_compare_val(
-            tgt_column=self.x[self.column], compare_val=compare_expr
-        )
+        compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)

         return (
             self.x.with_columns(
@@ -834,10 +907,8 @@ class Interrogator:
         low_val = _get_compare_expr_nw(compare=self.low)
         high_val = _get_compare_expr_nw(compare=self.high)

-        low_val = _modify_datetime_compare_val(tgt_column=self.x[self.column], compare_val=low_val)
-        high_val = _modify_datetime_compare_val(
-            tgt_column=self.x[self.column], compare_val=high_val
-        )
+        low_val = _safe_modify_datetime_compare_val(self.x, self.column, low_val)
+        high_val = _safe_modify_datetime_compare_val(self.x, self.column, high_val)

         tbl = self.x.with_columns(
             pb_is_good_1=nw.col(self.column).is_null(),  # val is Null in Column
@@ -1026,10 +1097,8 @@ class Interrogator:
         low_val = _get_compare_expr_nw(compare=self.low)
         high_val = _get_compare_expr_nw(compare=self.high)

-        low_val = _modify_datetime_compare_val(tgt_column=self.x[self.column], compare_val=low_val)
-        high_val = _modify_datetime_compare_val(
-            tgt_column=self.x[self.column], compare_val=high_val
-        )
+        low_val = _safe_modify_datetime_compare_val(self.x, self.column, low_val)
+        high_val = _safe_modify_datetime_compare_val(self.x, self.column, high_val)

         tbl = self.x.with_columns(
             pb_is_good_1=nw.col(self.column).is_null(),  # val is Null in Column
@@ -1209,14 +1278,15 @@ class Interrogator:
         else:
             columns_subset = self.columns_subset

-        # Create a subset of the table with only the columns of interest
-        subset_tbl = tbl.select(columns_subset)
+        # Create a count of duplicates using a group_by approach, as in the Ibis backend:
+        # group by the columns of interest and count occurrences
+        count_tbl = tbl.group_by(columns_subset).agg(nw.len().alias("pb_count_"))

-        # Check for duplicates in the subset table, creating a series of booleans
-        pb_is_good_series = subset_tbl.is_duplicated()
+        # Join back to the original table to get the count for each row
+        tbl = tbl.join(count_tbl, on=columns_subset, how="left")

-        # Add the series to the input table
-        tbl = tbl.with_columns(pb_is_good_=~pb_is_good_series)
+        # Passing rows have a count of `1` (no duplicates, so True); all other rows get False
+        tbl = tbl.with_columns(pb_is_good_=nw.col("pb_count_") == 1).drop("pb_count_")

         return tbl.to_native()

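Unlike `is_duplicated()`, which needs an eager frame, the group-by/count/join sequence is expressible as a lazy query plan, which is why it also works for LazyFrame and PySpark inputs. The same steps on a toy eager frame:

    import narwhals as nw
    import polars as pl

    tbl = nw.from_native(pl.DataFrame({"a": [1, 1, 2], "b": ["x", "x", "y"]}))

    count_tbl = tbl.group_by(["a", "b"]).agg(nw.len().alias("pb_count_"))
    tbl = tbl.join(count_tbl, on=["a", "b"], how="left")
    tbl = tbl.with_columns(pb_is_good_=nw.col("pb_count_") == 1).drop("pb_count_")
    # The two identical (1, "x") rows get pb_is_good_=False; (2, "y") gets True
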
@@ -2088,6 +2158,8 @@ class ConjointlyValidation:
             return self._get_pandas_results()
         elif "duckdb" in self.tbl_type or "ibis" in self.tbl_type:
             return self._get_ibis_results()
+        elif "pyspark" in self.tbl_type:
+            return self._get_pyspark_results()
         else:  # pragma: no cover
             raise NotImplementedError(f"Support for {self.tbl_type} is not yet implemented")

@@ -2247,6 +2319,53 @@
         results_tbl = self.data_tbl.mutate(pb_is_good_=ibis.literal(True))
         return results_tbl

+    def _get_pyspark_results(self):
+        """Process expressions for PySpark DataFrames."""
+        from pyspark.sql import functions as F
+
+        pyspark_columns = []
+
+        for expr_fn in self.expressions:
+            try:
+                # First try direct evaluation with PySpark DataFrame
+                expr_result = expr_fn(self.data_tbl)
+
+                # Check if it's a PySpark Column
+                if hasattr(expr_result, "_jc"):  # PySpark Column has _jc attribute
+                    pyspark_columns.append(expr_result)
+                else:
+                    raise TypeError(
+                        f"Expression returned {type(expr_result)}, expected PySpark Column"
+                    )
+
+            except Exception as e:
+                try:
+                    # Try as a ColumnExpression (for pb.expr_col style)
+                    col_expr = expr_fn(None)
+
+                    if hasattr(col_expr, "to_pyspark_expr"):
+                        # Convert to PySpark expression
+                        pyspark_expr = col_expr.to_pyspark_expr(self.data_tbl)
+                        pyspark_columns.append(pyspark_expr)
+                    else:
+                        raise TypeError(f"Cannot convert {type(col_expr)} to PySpark Column")
+                except Exception as nested_e:
+                    print(f"Error evaluating PySpark expression: {e} -> {nested_e}")
+
+        # Combine results with AND logic
+        if pyspark_columns:
+            final_result = pyspark_columns[0]
+            for col in pyspark_columns[1:]:
+                final_result = final_result & col
+
+            # Create results table with boolean column
+            results_tbl = self.data_tbl.withColumn("pb_is_good_", final_result)
+            return results_tbl
+
+        # Default case
+        results_tbl = self.data_tbl.withColumn("pb_is_good_", F.lit(True))
+        return results_tbl
+

 class SpeciallyValidation:
     def __init__(self, data_tbl, expression, threshold, tbl_type):
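
The AND-combination step above is plain PySpark Column algebra. A self-contained sketch of what `_get_pyspark_results()` produces for two expressions (illustrative data only):

    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1, 5), (2, 0)], ["a", "b"])

    # Two boolean Columns combined with `&`, as in _get_pyspark_results():
    cond = (F.col("a") > 0) & (F.col("b") > 0)
    df.withColumn("pb_is_good_", cond).show()  # row (2, 0) fails the second check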
@@ -2359,13 +2478,22 @@ class NumberOfTestUnits:
     column: str

     def get_test_units(self, tbl_type: str) -> int:
-        if tbl_type == "pandas" or tbl_type == "polars":
+        if (
+            tbl_type == "pandas"
+            or tbl_type == "polars"
+            or tbl_type == "pyspark"
+            or tbl_type == "local"
+        ):
             # Convert the DataFrame to a format that narwhals can work with and:
             # - check if the column exists
             dfn = _column_test_prep(
                 df=self.df, column=self.column, allowed_types=None, check_exists=False
             )

+            # Handle LazyFrames which don't have len()
+            if hasattr(dfn, "collect"):
+                dfn = dfn.collect()
+
             return len(dfn)

         if tbl_type in IBIS_BACKENDS:
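
The `collect()` guard matters because test units are counted with `len()`, which LazyFrames don't implement. For example:

    import narwhals as nw
    import polars as pl

    dfn = nw.from_native(pl.LazyFrame({"a": [1, 2, 3]}))

    if hasattr(dfn, "collect"):  # LazyFrames don't support len()
        dfn = dfn.collect()
    print(len(dfn))  # 3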
@@ -2383,7 +2511,22 @@ def _get_compare_expr_nw(compare: Any) -> Any:


 def _column_has_null_values(table: FrameT, column: str) -> bool:
-    null_count = (table.select(column).null_count())[column][0]
+    try:
+        # Try the standard null_count() method
+        null_count = (table.select(column).null_count())[column][0]
+    except AttributeError:
+        # For LazyFrames, collect first then get null count
+        try:
+            collected = table.select(column).collect()
+            null_count = (collected.null_count())[column][0]
+        except Exception:
+            # Fallback: check if any values are null
+            try:
+                result = table.select(nw.col(column).is_null().sum().alias("null_count")).collect()
+                null_count = result["null_count"][0]
+            except Exception:
+                # Last resort: return False (assume no nulls)
+                return False

     if null_count is None or null_count == 0:
         return False
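
Of the three strategies, the expression-based fallback is the most general, since an `is_null().sum()` aggregation is valid for eager and lazy frames alike:

    import narwhals as nw
    import polars as pl

    table = nw.from_native(pl.LazyFrame({"a": [1, None, 3]}))

    # LazyFrames have no null_count() method, but an aggregation works lazily:
    result = table.select(nw.col("a").is_null().sum().alias("null_count")).collect()
    print(result["null_count"][0])  # 1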
@@ -2414,7 +2557,7 @@ def _check_nulls_across_columns_nw(table, columns_subset):

     # Build the expression by combining each column's `is_null()` with OR operations
     null_expr = functools.reduce(
-        lambda acc, col: acc | table[col].is_null() if acc is not None else table[col].is_null(),
+        lambda acc, col: acc | nw.col(col).is_null() if acc is not None else nw.col(col).is_null(),
         column_names,
         None,
     )
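
Switching from `table[col].is_null()` to `nw.col(col).is_null()` keeps the reduction purely expression-based, so no column is ever materialized and the check also works on LazyFrames. The reduce builds one OR chain:

    import functools

    import narwhals as nw

    column_names = ["a", "b", "c"]
    null_expr = functools.reduce(
        lambda acc, col: acc | nw.col(col).is_null() if acc is not None else nw.col(col).is_null(),
        column_names,
        None,
    )
    # Equivalent to:
    # nw.col("a").is_null() | nw.col("b").is_null() | nw.col("c").is_null()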
pointblank/_utils.py CHANGED
@@ -66,11 +66,13 @@ def _get_tbl_type(data: FrameT | Any) -> str:
     except Exception as e:
         raise TypeError("The `data` object is not a DataFrame or Ibis Table.") from e

-    # Detect through regex if the table is a polars or pandas DataFrame
+    # Detect through regex if the table is a polars, pandas, or Spark DataFrame
     if re.search(r"polars", df_ns_str, re.IGNORECASE):
         return "polars"
     elif re.search(r"pandas", df_ns_str, re.IGNORECASE):
         return "pandas"
+    elif re.search(r"pyspark", df_ns_str, re.IGNORECASE):
+        return "pyspark"

     # If ibis is present, then get the table's backend name
     ibis_present = _is_lib_present(lib_name="ibis")
@@ -164,7 +166,7 @@ def _check_any_df_lib(method_used: str) -> None:
 def _is_value_a_df(value: Any) -> bool:
     try:
         ns = nw.get_native_namespace(value)
-        if "polars" in str(ns) or "pandas" in str(ns):
+        if "polars" in str(ns) or "pandas" in str(ns) or "pyspark" in str(ns):
             return True
         else:  # pragma: no cover
             return False
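
Both helpers rely on the same signal: the string form of the native namespace names the backing library. A quick check with a Polars frame (a PySpark DataFrame would yield a namespace string containing "pyspark"):

    import narwhals as nw
    import polars as pl

    df = pl.DataFrame({"a": [1]})
    ns_str = str(nw.get_native_namespace(nw.from_native(df)))
    assert "polars" in ns_str  # _get_tbl_type() regex-matches this string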
@@ -619,6 +621,10 @@ def _get_api_text() -> str:
         "expr_col",
     ]

+    segments_exported = [
+        "seg_group",
+    ]
+
     interrogation_exported = [
         "Validate.interrogate",
         "Validate.get_tabular_report",
@@ -648,6 +654,12 @@
         "assistant",
         "load_dataset",
         "get_data_path",
+        "connect_to_table",
+    ]
+
+    yaml_exported = [
+        "yaml_interrogate",
+        "validate_yaml",
     ]

     utility_exported = [
@@ -679,6 +691,10 @@ many steps). Furthermore, the `col()` function can be used to declare a comparis
 for the `value=` argument in many `col_vals_*()` methods) when you can't use a fixed value
 for comparison."""

+    segments_desc = (
+        """Combine multiple values into a single segment using `seg_*()` helper functions."""
+    )
+
     interrogation_desc = """The validation plan is put into action when `interrogate()` is called.
 The workflow for performing a comprehensive validation is then: (1) `Validate()`, (2) adding
 validation steps, (3) `interrogate()`. After interrogation of the data, we can view a validation
@@ -694,6 +710,11 @@ datasets included in the package can be accessed via the `load_dataset()` functi
 `config()` utility lets us set global configuration parameters. Want to chat with an assistant? Use
 the `assistant()` function to get help with Pointblank."""

+    yaml_desc = """The *YAML* group contains functions that allow for the use of YAML to orchestrate
+validation workflows. The `yaml_interrogate()` function can be used to run a validation workflow from
+YAML strings or files. The `validate_yaml()` function checks if the YAML configuration
+passes its own validity checks."""
+
     utility_desc = """The Utility Functions group contains functions that are useful for accessing
 metadata about the target data. Use `get_column_count()` or `get_row_count()` to get the number of
 columns or rows in a table. The `get_action_metadata()` function is useful when building custom
@@ -718,12 +739,18 @@ table information, and timing details."""
     api_text += f"""\n## The Column Selection family\n\n{column_selection_desc}\n\n"""
     api_text += get_api_details(module=pointblank, exported_list=column_selection_exported)

+    api_text += f"""\n## The Segments family\n\n{segments_desc}\n\n"""
+    api_text += get_api_details(module=pointblank, exported_list=segments_exported)
+
     api_text += f"""\n## The Interrogation and Reporting family\n\n{interrogation_desc}\n\n"""
     api_text += get_api_details(module=pointblank, exported_list=interrogation_exported)

     api_text += f"""\n## The Inspection and Assistance family\n\n{inspect_desc}\n\n"""
     api_text += get_api_details(module=pointblank, exported_list=inspect_exported)

+    api_text += f"""\n## The YAML family\n\n{yaml_desc}\n\n"""
+    api_text += get_api_details(module=pointblank, exported_list=yaml_exported)
+
     api_text += f"""\n## The Utility Functions family\n\n{utility_desc}\n\n"""
     api_text += get_api_details(module=pointblank, exported_list=utility_exported)

pointblank/assistant.py CHANGED
@@ -138,10 +138,15 @@ def assistant(

     - Polars DataFrame (`"polars"`)
     - Pandas DataFrame (`"pandas"`)
+    - PySpark table (`"pyspark"`)
     - DuckDB table (`"duckdb"`)*
     - MySQL table (`"mysql"`)*
     - PostgreSQL table (`"postgresql"`)*
     - SQLite table (`"sqlite"`)*
+    - Microsoft SQL Server table (`"mssql"`)*
+    - Snowflake table (`"snowflake"`)*
+    - Databricks table (`"databricks"`)*
+    - BigQuery table (`"bigquery"`)*
     - Parquet table (`"parquet"`)*
     - CSV files (string path or `pathlib.Path` object with `.csv` extension)
     - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
@@ -152,6 +157,10 @@ def assistant(
     `ibis.expr.types.relations.Table`). Furthermore, using `assistant()` with these types of tables
     requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a Polars or
     Pandas DataFrame, the availability of Ibis is not needed.
+
+    To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
+    provided. The file will be automatically detected and loaded using the best available DataFrame
+    library. The loading preference is Polars first, then Pandas as a fallback.
     """

     # Check that the chatlas package is installed
pointblank/cli.py CHANGED
@@ -1360,10 +1360,10 @@ def preview(
     For tables with many columns, use these options to control which columns are displayed:

     \b
-    - --columns: Specify exact columns (e.g., --columns "name,age,email")
-    - --col-range: Select column range (e.g., --col-range "1:10", --col-range "5:", --col-range ":15")
-    - --col-first: Show first N columns (e.g., --col-first 5)
-    - --col-last: Show last N columns (e.g., --col-last 3)
+    - --columns: Specify exact columns (--columns "name,age,email")
+    - --col-range: Select column range (--col-range "1:10", --col-range "5:", --col-range ":15")
+    - --col-first: Show first N columns (--col-first 5)
+    - --col-last: Show last N columns (--col-last 3)

     Tables with >15 columns automatically show first 7 and last 7 columns with indicators.
     """
@@ -1920,31 +1920,43 @@ def validate(

     AVAILABLE CHECK_TYPES:

-    Use --list-checks to see all available validation methods with examples.
-
-    The default CHECK_TYPE is 'rows-distinct' which checks for duplicate rows.
+    Require no additional options:

     \b
     - rows-distinct: Check if all rows in the dataset are unique (no duplicates)
     - rows-complete: Check if all rows are complete (no missing values in any column)
-    - col-exists: Check if a specific column exists in the dataset (requires --column)
-    - col-vals-not-null: Check if all values in a column are not null/missing (requires --column)
-    - col-vals-gt: Check if all values in a column are greater than a comparison value (requires --column and --value)
-    - col-vals-ge: Check if all values in a column are greater than or equal to a comparison value (requires --column and --value)
-    - col-vals-lt: Check if all values in a column are less than a comparison value (requires --column and --value)
-    - col-vals-le: Check if all values in a column are less than or equal to a comparison value (requires --column and --value)
-    - col-vals-in-set: Check if all values in a column are in an allowed set (requires --column and --set)
+
+    Require --column:
+
+    \b
+    - col-exists: Check if a specific column exists in the dataset
+    - col-vals-not-null: Check if all values in a column are not null/missing
+
+    Require --column and --value:
+
+    \b
+    - col-vals-gt: Check if column values are greater than a fixed value
+    - col-vals-ge: Check if column values are greater than or equal to a fixed value
+    - col-vals-lt: Check if column values are less than a fixed value
+    - col-vals-le: Check if column values are less than or equal to a fixed value
+
+    Require --column and --set:
+
+    \b
+    - col-vals-in-set: Check if column values are in an allowed set
+
+    Use --list-checks to see all available validation methods with examples. The default CHECK_TYPE
+    is 'rows-distinct' which checks for duplicate rows.

     Examples:

     \b
-    pb validate data.csv                      # Uses default validation (rows-distinct)
-    pb validate data.csv --list-checks        # Show all available checks
+    pb validate data.csv                 # Uses default validation (rows-distinct)
+    pb validate data.csv --list-checks   # Show all available checks
     pb validate data.csv --check rows-distinct
     pb validate data.csv --check rows-distinct --show-extract
     pb validate data.csv --check rows-distinct --write-extract failing_rows_folder
     pb validate data.csv --check rows-distinct --exit-code
-    pb validate data.csv --check rows-complete
     pb validate data.csv --check col-exists --column price
     pb validate data.csv --check col-vals-not-null --column email
     pb validate data.csv --check col-vals-gt --column score --value 50
@@ -1952,7 +1964,6 @@ def validate(

     Multiple validations in one command:
     pb validate data.csv --check rows-distinct --check rows-complete
-    pb validate data.csv --check col-vals-not-null --column email --check col-vals-gt --column age --value 18
     """
     try:
         import sys
@@ -4627,36 +4638,40 @@ def pl(
     pb pl "pl.read_csv('data.csv').select(['name', 'age'])"
     pb pl "pl.read_csv('data.csv').filter(pl.col('age') > 25)"

+    \b
     # Multi-line with editor (supports multiple statements)
     pb pl --edit

+    \b
     # Multi-statement code example in editor:
     # csv = pl.read_csv('data.csv')
     # result = csv.select(['name', 'age']).filter(pl.col('age') > 25)

+    \b
     # Multi-line with a specific editor
     pb pl --edit --editor nano
     pb pl --edit --editor code
     pb pl --edit --editor micro

+    \b
     # From file
     pb pl --file query.py

-    # Piping to other pb commands
-    pb pl "pl.read_csv('data.csv').filter(pl.col('age') > 25)" --pipe | pb validate --check rows-distinct
+    \b
+    Piping to other pb commands
+    pb pl "pl.read_csv('data.csv').head(20)" --pipe | pb validate --check rows-distinct
     pb pl --edit --pipe | pb preview --head 10
     pb pl --edit --pipe | pb scan --output-html report.html
     pb pl --edit --pipe | pb missing --output-html missing_report.html

-    Use --output-format to change how results are displayed:
-
     \b
+    Use --output-format to change how results are displayed:
     pb pl "pl.read_csv('data.csv')" --output-format scan
     pb pl "pl.read_csv('data.csv')" --output-format missing
     pb pl "pl.read_csv('data.csv')" --output-format info

-    Note: For multi-statement code, assign your final result to a variable like
-    'result', 'df', 'data', or ensure it's the last expression.
+    Note: For multi-statement code, assign your final result to a variable like 'result', 'df',
+    'data', or ensure it's the last expression.
     """
     try:
         # Check if Polars is available