pointblank 0.13.4__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. pointblank/__init__.py +4 -0
  2. pointblank/_constants.py +117 -0
  3. pointblank/_constants_translations.py +487 -2
  4. pointblank/_interrogation.py +1065 -12
  5. pointblank/_spec_utils.py +1015 -0
  6. pointblank/_utils.py +17 -7
  7. pointblank/_utils_ai.py +875 -0
  8. pointblank/assistant.py +1 -1
  9. pointblank/cli.py +128 -115
  10. pointblank/column.py +1 -1
  11. pointblank/data/api-docs.txt +1838 -130
  12. pointblank/data/validations/README.md +108 -0
  13. pointblank/data/validations/complex_preprocessing.json +54 -0
  14. pointblank/data/validations/complex_preprocessing.pkl +0 -0
  15. pointblank/data/validations/generate_test_files.py +127 -0
  16. pointblank/data/validations/multiple_steps.json +83 -0
  17. pointblank/data/validations/multiple_steps.pkl +0 -0
  18. pointblank/data/validations/narwhals_function.json +28 -0
  19. pointblank/data/validations/narwhals_function.pkl +0 -0
  20. pointblank/data/validations/no_preprocessing.json +83 -0
  21. pointblank/data/validations/no_preprocessing.pkl +0 -0
  22. pointblank/data/validations/pandas_compatible.json +28 -0
  23. pointblank/data/validations/pandas_compatible.pkl +0 -0
  24. pointblank/data/validations/preprocessing_functions.py +46 -0
  25. pointblank/data/validations/simple_preprocessing.json +57 -0
  26. pointblank/data/validations/simple_preprocessing.pkl +0 -0
  27. pointblank/datascan.py +4 -4
  28. pointblank/draft.py +52 -3
  29. pointblank/scan_profile.py +6 -6
  30. pointblank/schema.py +8 -82
  31. pointblank/thresholds.py +1 -1
  32. pointblank/validate.py +3069 -437
  33. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/METADATA +67 -8
  34. pointblank-0.15.0.dist-info/RECORD +56 -0
  35. pointblank-0.13.4.dist-info/RECORD +0 -39
  36. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/WHEEL +0 -0
  37. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/entry_points.txt +0 -0
  38. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/licenses/LICENSE +0 -0
  39. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/top_level.txt +0 -0
pointblank/assistant.py CHANGED
@@ -55,7 +55,7 @@ def assistant(
55
55
  ----------
56
56
  model
57
57
  The model to be used. This should be in the form of `provider:model` (e.g.,
58
- `"anthropic:claude-3-5-sonnet-latest"`). Supported providers are `"anthropic"`, `"openai"`,
58
+ `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`,
59
59
  `"ollama"`, and `"bedrock"`.
60
60
  data
61
61
  An optional data table to focus on during discussion with the PbA, which could be a
pointblank/cli.py CHANGED
@@ -295,6 +295,46 @@ def _format_dtype_compact(dtype_str: str) -> str:
295
295
  return dtype_str
296
296
 
297
297
 
298
+ def _format_units(n: int) -> str:
299
+ """Format large numbers with K, M, B abbreviations for values above 10,000."""
300
+ if n is None:
301
+ return "—"
302
+ if n >= 1000000000: # Billions
303
+ return f"{n / 1000000000:.1f}B"
304
+ elif n >= 1000000: # Millions
305
+ return f"{n / 1000000:.1f}M"
306
+ elif n >= 10000: # Use K for 10,000 and above
307
+ return f"{n / 1000:.0f}K"
308
+ else:
309
+ return str(n)
310
+
311
+
312
+ def _format_pass_fail(passed: int, total: int) -> str:
313
+ """Format pass/fail counts with abbreviated numbers and fractions."""
314
+ if passed is None or total is None or total == 0:
315
+ return "—/—"
316
+
317
+ # Calculate fraction
318
+ fraction = passed / total
319
+
320
+ # Format fraction with special handling for very small and very large values
321
+ if fraction == 0.0:
322
+ fraction_str = "0.00"
323
+ elif fraction == 1.0:
324
+ fraction_str = "1.00"
325
+ elif fraction < 0.005: # Less than 0.005 rounds to 0.00
326
+ fraction_str = "<0.01"
327
+ elif fraction > 0.995: # Greater than 0.995 rounds to 1.00
328
+ fraction_str = ">0.99"
329
+ else:
330
+ fraction_str = f"{fraction:.2f}"
331
+
332
+ # Format absolute number with abbreviations
333
+ absolute_str = _format_units(passed)
334
+
335
+ return f"{absolute_str}/{fraction_str}"
336
+
337
+
298
338
  def _rich_print_scan_table(
299
339
  scan_result: Any,
300
340
  data_source: str,
@@ -314,7 +354,7 @@ def _rich_print_scan_table(
314
354
  total_rows: Total number of rows in the dataset
315
355
  total_columns: Total number of columns in the dataset
316
356
  """
317
- try:
357
+ try: # pragma: no cover
318
358
  import re
319
359
 
320
360
  import narwhals as nw
@@ -556,7 +596,7 @@ def _rich_print_scan_table(
556
596
  console.print()
557
597
  console.print(scan_table)
558
598
 
559
- except Exception as e:
599
+ except Exception as e: # pragma: no cover
560
600
  # Fallback to simple message if table creation fails
561
601
  console.print(f"[yellow]Scan results available for {data_source}[/yellow]")
562
602
  console.print(f"[red]Error displaying table: {str(e)}[/red]")
@@ -725,7 +765,7 @@ def _rich_print_gt_table(
725
765
  # Create header with column name and data type
726
766
  header_text = f"{display_col}\n[dim yellow]{dtype_display}[/dim yellow]"
727
767
  else:
728
- header_text = display_col
768
+ header_text = display_col # pragma: no cover
729
769
 
730
770
  rich_table.add_column(
731
771
  header_text,
@@ -774,7 +814,7 @@ def _rich_print_gt_table(
774
814
  ]
775
815
  for row in data_dict
776
816
  ]
777
- elif hasattr(df, "to_dict"):
817
+ elif hasattr(df, "to_dict"): # pragma: no cover
778
818
  # Pandas-like interface
779
819
  data_dict = df.to_dict("records")
780
820
  if len(columns) > max_terminal_cols:
@@ -808,7 +848,7 @@ def _rich_print_gt_table(
808
848
  ]
809
849
  for row in data_dict
810
850
  ]
811
- elif hasattr(df, "iter_rows"):
851
+ elif hasattr(df, "iter_rows"): # pragma: no cover
812
852
  # Polars lazy frame
813
853
  rows = [
814
854
  [
@@ -822,7 +862,7 @@ def _rich_print_gt_table(
822
862
  ]
823
863
  for row in df.iter_rows()
824
864
  ]
825
- elif hasattr(df, "__iter__"):
865
+ elif hasattr(df, "__iter__"): # pragma: no cover
826
866
  # Try to iterate directly
827
867
  rows = [
828
868
  [
@@ -1031,51 +1071,13 @@ def _display_validation_summary(validation: Any) -> None:
1031
1071
  steps_table.add_column("C", style="red")
1032
1072
  steps_table.add_column("Ext", style="blue", justify="center")
1033
1073
 
1034
- def format_units(n: int) -> str:
1035
- """Format large numbers with K, M, B abbreviations for values above 10,000."""
1036
- if n is None:
1037
- return "—"
1038
- if n >= 1000000000: # Billions
1039
- return f"{n / 1000000000:.1f}B"
1040
- elif n >= 1000000: # Millions
1041
- return f"{n / 1000000:.1f}M"
1042
- elif n >= 10000: # Use K for 10,000 and above
1043
- return f"{n / 1000:.0f}K"
1044
- else:
1045
- return str(n)
1046
-
1047
- def format_pass_fail(passed: int, total: int) -> str:
1048
- """Format pass/fail counts with abbreviated numbers and fractions."""
1049
- if passed is None or total is None or total == 0:
1050
- return "—/—"
1051
-
1052
- # Calculate fraction
1053
- fraction = passed / total
1054
-
1055
- # Format fraction with special handling for very small and very large values
1056
- if fraction == 0.0:
1057
- fraction_str = "0.00"
1058
- elif fraction == 1.0:
1059
- fraction_str = "1.00"
1060
- elif fraction < 0.005: # Less than 0.005 rounds to 0.00
1061
- fraction_str = "<0.01"
1062
- elif fraction > 0.995: # Greater than 0.995 rounds to 1.00
1063
- fraction_str = ">0.99"
1064
- else:
1065
- fraction_str = f"{fraction:.2f}"
1066
-
1067
- # Format absolute number with abbreviations
1068
- absolute_str = format_units(passed)
1069
-
1070
- return f"{absolute_str}/{fraction_str}"
1071
-
1072
1074
  for step in info:
1073
1075
  # Extract values information for the Values column
1074
1076
  values_str = "—" # Default to em dash if no values
1075
1077
 
1076
1078
  # Handle different validation types
1077
1079
  if step.assertion_type == "col_schema_match":
1078
- values_str = "—" # Schema is too complex to display inline
1080
+ values_str = "—" # pragma: no cover
1079
1081
  elif step.assertion_type == "col_vals_between":
1080
1082
  # For between validations, try to get left and right bounds
1081
1083
  if (
@@ -1090,37 +1092,42 @@ def _display_validation_summary(validation: Any) -> None:
1090
1092
  values_str = f"[{step.values[0]}, {step.values[1]}]"
1091
1093
  else:
1092
1094
  values_str = str(step.values)
1093
- elif step.assertion_type in ["row_count_match", "col_count_match"]:
1095
+ elif step.assertion_type in [
1096
+ "row_count_match",
1097
+ "col_count_match",
1098
+ ]: # pragma: no cover
1094
1099
  # For count match validations, extract the 'count' value from the dictionary
1095
- if hasattr(step, "values") and step.values is not None:
1096
- if isinstance(step.values, dict) and "count" in step.values:
1097
- values_str = str(step.values["count"])
1098
- else:
1099
- values_str = str(step.values)
1100
- else:
1101
- values_str = "—"
1100
+ if hasattr(step, "values") and step.values is not None: # pragma: no cover
1101
+ if (
1102
+ isinstance(step.values, dict) and "count" in step.values
1103
+ ): # pragma: no cover
1104
+ values_str = str(step.values["count"]) # pragma: no cover
1105
+ else: # pragma: no cover
1106
+ values_str = str(step.values) # pragma: no cover
1107
+ else: # pragma: no cover
1108
+ values_str = "—" # pragma: no cover
1102
1109
  elif step.assertion_type in ["col_vals_expr", "conjointly"]:
1103
- values_str = "COLUMN EXPR"
1110
+ values_str = "COLUMN EXPR" # pragma: no cover
1104
1111
  elif step.assertion_type == "specially":
1105
- values_str = "EXPR"
1112
+ values_str = "EXPR" # pragma: no cover
1106
1113
  elif hasattr(step, "values") and step.values is not None:
1107
1114
  if isinstance(step.values, (list, tuple)):
1108
1115
  if len(step.values) <= 3:
1109
1116
  values_str = ", ".join(str(v) for v in step.values)
1110
- else:
1111
- values_str = f"{', '.join(str(v) for v in step.values[:3])}..."
1117
+ else: # pragma: no cover
1118
+ values_str = f"{', '.join(str(v) for v in step.values[:3])}..." # pragma: no cover
1112
1119
  else:
1113
1120
  values_str = str(step.values)
1114
1121
  elif hasattr(step, "value") and step.value is not None:
1115
1122
  values_str = str(step.value)
1116
- elif hasattr(step, "set") and step.set is not None:
1117
- if isinstance(step.set, (list, tuple)):
1118
- if len(step.set) <= 3:
1119
- values_str = ", ".join(str(v) for v in step.set)
1120
- else:
1121
- values_str = f"{', '.join(str(v) for v in step.set[:3])}..."
1122
- else:
1123
- values_str = str(step.set)
1123
+ elif hasattr(step, "set") and step.set is not None: # pragma: no cover
1124
+ if isinstance(step.set, (list, tuple)): # pragma: no cover
1125
+ if len(step.set) <= 3: # pragma: no cover
1126
+ values_str = ", ".join(str(v) for v in step.set) # pragma: no cover
1127
+ else: # pragma: no cover
1128
+ values_str = f"{', '.join(str(v) for v in step.set[:3])}..." # pragma: no cover
1129
+ else: # pragma: no cover
1130
+ values_str = str(step.set) # pragma: no cover
1124
1131
 
1125
1132
  # Determine threshold status for W, E, C columns
1126
1133
  # Check if thresholds are set and whether they were exceeded
@@ -1132,10 +1139,10 @@ def _display_validation_summary(validation: Any) -> None:
1132
1139
  and hasattr(step.thresholds, "warning")
1133
1140
  and step.thresholds.warning is not None
1134
1141
  ):
1135
- w_status = (
1136
- "[bright_black]●[/bright_black]"
1137
- if step.warning
1138
- else "[bright_black]○[/bright_black]"
1142
+ w_status = ( # pragma: no cover
1143
+ "[bright_black]●[/bright_black]" # pragma: no cover
1144
+ if step.warning # pragma: no cover
1145
+ else "[bright_black]○[/bright_black]" # pragma: no cover
1139
1146
  )
1140
1147
  else:
1141
1148
  w_status = "—"
@@ -1178,9 +1185,9 @@ def _display_validation_summary(validation: Any) -> None:
1178
1185
  step.assertion_type,
1179
1186
  str(step.column) if step.column else "—",
1180
1187
  values_str,
1181
- format_units(step.n),
1182
- format_pass_fail(step.n_passed, step.n),
1183
- format_pass_fail(step.n - step.n_passed, step.n),
1188
+ _format_units(step.n),
1189
+ _format_pass_fail(step.n_passed, step.n),
1190
+ _format_pass_fail(step.n - step.n_passed, step.n),
1184
1191
  w_status,
1185
1192
  e_status,
1186
1193
  c_status,
@@ -1224,7 +1231,7 @@ def _display_validation_summary(validation: Any) -> None:
1224
1231
  console.print("[yellow]Validation object does not contain validation results.[/yellow]")
1225
1232
 
1226
1233
  except Exception as e: # pragma: no cover
1227
- console.print(f"[red]Error displaying validation summary:[/red] {e}")
1234
+ console.print(f"[red]Error displaying validation summary:[/red] {e}") # pragma: no cover
1228
1235
  import traceback # pragma: no cover
1229
1236
 
1230
1237
  console.print(f"[dim]{traceback.format_exc()}[/dim]") # pragma: no cover
@@ -1372,24 +1379,26 @@ def preview(
1372
1379
 
1373
1380
  # Handle piped input
1374
1381
  if data_source is None:
1375
- if not sys.stdin.isatty():
1382
+ if not sys.stdin.isatty(): # pragma: no cover
1376
1383
  # Data is being piped in - read the file path from stdin
1377
- piped_input = sys.stdin.read().strip()
1378
- if piped_input:
1379
- data_source = piped_input
1384
+ piped_input = sys.stdin.read().strip() # pragma: no cover
1385
+ if piped_input: # pragma: no cover
1386
+ data_source = piped_input # pragma: no cover
1380
1387
 
1381
1388
  # Determine the format from the file extension
1382
- if piped_input.endswith(".parquet"):
1383
- format_type = "Parquet"
1384
- elif piped_input.endswith(".csv"):
1385
- format_type = "CSV"
1386
- else:
1387
- format_type = "unknown"
1389
+ if piped_input.endswith(".parquet"): # pragma: no cover
1390
+ format_type = "Parquet" # pragma: no cover
1391
+ elif piped_input.endswith(".csv"): # pragma: no cover
1392
+ format_type = "CSV" # pragma: no cover
1393
+ else: # pragma: no cover
1394
+ format_type = "unknown" # pragma: no cover
1388
1395
 
1389
- console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
1390
- else:
1391
- console.print("[red]Error:[/red] No data provided via pipe")
1392
- sys.exit(1)
1396
+ console.print(
1397
+ f"[dim]Using piped data source in {format_type} format.[/dim]"
1398
+ ) # pragma: no cover
1399
+ else: # pragma: no cover
1400
+ console.print("[red]Error:[/red] No data provided via pipe") # pragma: no cover
1401
+ sys.exit(1) # pragma: no cover
1393
1402
  else:
1394
1403
  # Show concise help and exit
1395
1404
  _show_concise_help("preview", None)
@@ -1742,24 +1751,26 @@ def missing(data_source: str | None, output_html: str | None):
1742
1751
 
1743
1752
  # Handle piped input
1744
1753
  if data_source is None:
1745
- if not sys.stdin.isatty():
1754
+ if not sys.stdin.isatty(): # pragma: no cover
1746
1755
  # Data is being piped in - read the file path from stdin
1747
- piped_input = sys.stdin.read().strip()
1748
- if piped_input:
1749
- data_source = piped_input
1756
+ piped_input = sys.stdin.read().strip() # pragma: no cover
1757
+ if piped_input: # pragma: no cover
1758
+ data_source = piped_input # pragma: no cover
1750
1759
 
1751
1760
  # Determine the format from the file extension
1752
- if piped_input.endswith(".parquet"):
1753
- format_type = "Parquet"
1754
- elif piped_input.endswith(".csv"):
1755
- format_type = "CSV"
1756
- else:
1757
- format_type = "unknown"
1761
+ if piped_input.endswith(".parquet"): # pragma: no cover
1762
+ format_type = "Parquet" # pragma: no cover
1763
+ elif piped_input.endswith(".csv"): # pragma: no cover
1764
+ format_type = "CSV" # pragma: no cover
1765
+ else: # pragma: no cover
1766
+ format_type = "unknown" # pragma: no cover
1758
1767
 
1759
- console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
1760
- else:
1761
- console.print("[red]Error:[/red] No data provided via pipe")
1762
- sys.exit(1)
1768
+ console.print(
1769
+ f"[dim]Using piped data source in {format_type} format.[/dim]"
1770
+ ) # pragma: no cover
1771
+ else: # pragma: no cover
1772
+ console.print("[red]Error:[/red] No data provided via pipe") # pragma: no cover
1773
+ sys.exit(1) # pragma: no cover
1763
1774
  else:
1764
1775
  # Show concise help and exit
1765
1776
  _show_concise_help("missing", None)
@@ -2027,24 +2038,26 @@ def validate(
2027
2038
  # or if we have piped input
2028
2039
  if data_source is None:
2029
2040
  # Check if we have piped input
2030
- if not sys.stdin.isatty():
2041
+ if not sys.stdin.isatty(): # pragma: no cover
2031
2042
  # Data is being piped in: read the file path from stdin
2032
- piped_input = sys.stdin.read().strip()
2033
- if piped_input:
2034
- data_source = piped_input
2043
+ piped_input = sys.stdin.read().strip() # pragma: no cover
2044
+ if piped_input: # pragma: no cover
2045
+ data_source = piped_input # pragma: no cover
2035
2046
 
2036
2047
  # Determine the format from the file extension
2037
- if piped_input.endswith(".parquet"):
2038
- format_type = "Parquet"
2039
- elif piped_input.endswith(".csv"):
2040
- format_type = "CSV"
2041
- else:
2042
- format_type = "unknown"
2048
+ if piped_input.endswith(".parquet"): # pragma: no cover
2049
+ format_type = "Parquet" # pragma: no cover
2050
+ elif piped_input.endswith(".csv"): # pragma: no cover
2051
+ format_type = "CSV" # pragma: no cover
2052
+ else: # pragma: no cover
2053
+ format_type = "unknown" # pragma: no cover
2043
2054
 
2044
- console.print(f"[dim]Using piped data source in {format_type} format.[/dim]")
2045
- else:
2046
- console.print("[red]Error:[/red] No data provided via pipe")
2047
- sys.exit(1)
2055
+ console.print(
2056
+ f"[dim]Using piped data source in {format_type} format.[/dim]"
2057
+ ) # pragma: no cover
2058
+ else: # pragma: no cover
2059
+ console.print("[red]Error:[/red] No data provided via pipe") # pragma: no cover
2060
+ sys.exit(1) # pragma: no cover
2048
2061
  else:
2049
2062
  # Show concise help and exit
2050
2063
  _show_concise_help("validate", None)
pointblank/column.py CHANGED
@@ -219,7 +219,7 @@ class ColumnSelectorNarwhals(Column):
219
219
  # Use `collect_schema()` for LazyFrame to avoid performance warnings
220
220
  if hasattr(selected_df, "collect_schema"):
221
221
  return list(selected_df.collect_schema().keys())
222
- else:
222
+ else: # pragma: no cover
223
223
  return list(selected_df.columns)
224
224
 
225
225