pointblank 0.17.0__py3-none-any.whl → 0.18.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/datascan.py CHANGED
@@ -3,12 +3,11 @@ from __future__ import annotations
3
3
  import contextlib
4
4
  import json
5
5
  from importlib.metadata import version
6
- from typing import TYPE_CHECKING, Any
6
+ from typing import TYPE_CHECKING, Any, cast
7
7
 
8
8
  import narwhals as nw
9
9
  from great_tables import GT, google_font, html, loc, style
10
10
  from narwhals.dataframe import LazyFrame
11
- from narwhals.typing import FrameT
12
11
 
13
12
  from pointblank._utils_html import _create_table_dims_html, _create_table_type_html, _fmt_frac
14
13
  from pointblank.scan_profile import ColumnProfile, _as_physical, _DataProfile, _TypeMap
@@ -18,7 +17,7 @@ if TYPE_CHECKING:
18
17
  from collections.abc import Mapping, Sequence
19
18
 
20
19
  from narwhals.dataframe import DataFrame
21
- from narwhals.typing import Frame, IntoFrameT
20
+ from narwhals.typing import Frame
22
21
 
23
22
  from pointblank.scan_profile_stats import StatGroup
24
23
 
@@ -123,7 +122,7 @@ class DataScan:
123
122
  """
124
123
 
125
124
  # TODO: This needs to be generically typed at the class level, ie. DataScan[T]
126
- def __init__(self, data: IntoFrameT, tbl_name: str | None = None) -> None:
125
+ def __init__(self, data: Any, tbl_name: str | None = None) -> None:
127
126
  # Import processing functions from validate module
128
127
  from pointblank.validate import (
129
128
  _process_data,
@@ -172,7 +171,7 @@ class DataScan:
172
171
  implementation=self.nw_data.implementation,
173
172
  )
174
173
  for column in columns:
175
- col_data: DataFrame = self.nw_data.select(column)
174
+ col_data: Frame = self.nw_data.select(column)
176
175
 
177
176
  ## Handle dtyping:
178
177
  native_dtype = schema[column]
@@ -183,7 +182,7 @@ class DataScan:
183
182
  except NotImplementedError:
184
183
  continue
185
184
 
186
- col_profile = ColumnProfile(colname=column, coltype=native_dtype)
185
+ col_profile = ColumnProfile(colname=column, coltype=str(native_dtype))
187
186
 
188
187
  ## Collect Sample Data:
189
188
  ## This is the most consistent way (i think) to get the samples out of the data.
@@ -205,7 +204,7 @@ class DataScan:
205
204
  return profile
206
205
 
207
206
  @property
208
- def summary_data(self) -> IntoFrameT:
207
+ def summary_data(self) -> Any:
209
208
  return self.profile.as_dataframe(strict=False).to_native()
210
209
 
211
210
  def get_tabular_report(self, *, show_sample_data: bool = False) -> GT:
@@ -318,11 +317,10 @@ class DataScan:
318
317
 
319
318
  # format fractions:
320
319
  # this is an anti-pattern but there's no serious alternative
320
+ _backend = cast(Any, self.profile.implementation)
321
321
  for _fmt_col in ("__frac_n_unique", "__frac_n_missing"):
322
322
  _formatted: list[str | None] = _fmt_frac(formatted_data[_fmt_col])
323
- formatted: nw.Series = nw.new_series(
324
- _fmt_col, values=_formatted, backend=self.profile.implementation
325
- )
323
+ formatted: nw.Series = nw.new_series(_fmt_col, values=_formatted, backend=_backend)
326
324
  formatted_data = formatted_data.drop(_fmt_col)
327
325
  formatted_data = formatted_data.with_columns(formatted.alias(_fmt_col))
328
326
 
@@ -365,10 +363,10 @@ class DataScan:
365
363
  trues.append(None)
366
364
  falses.append(None)
367
365
  true_ser: nw.Series = nw.new_series(
368
- name="__freq_true", values=trues, backend=self.profile.implementation
366
+ name="__freq_true", values=trues, backend=_backend
369
367
  )
370
368
  false_ser: nw.Series = nw.new_series(
371
- name="__freq_false", values=falses, backend=self.profile.implementation
369
+ name="__freq_false", values=falses, backend=_backend
372
370
  )
373
371
  formatted_data = formatted_data.with_columns(
374
372
  __freq_true=true_ser, __freq_false=false_ser
@@ -382,9 +380,7 @@ class DataScan:
382
380
  )
383
381
  for _fmt_col in ("__pct_true", "__pct_false"):
384
382
  _formatted: list[str | None] = _fmt_frac(formatted_data[_fmt_col])
385
- formatted = nw.new_series(
386
- name=_fmt_col, values=_formatted, backend=self.profile.implementation
387
- )
383
+ formatted = nw.new_series(name=_fmt_col, values=_formatted, backend=_backend)
388
384
  formatted_data = formatted_data.drop(_fmt_col)
389
385
  formatted_data = formatted_data.with_columns(formatted.alias(_fmt_col))
390
386
 
@@ -459,7 +455,11 @@ class DataScan:
459
455
  )
460
456
  .tab_style(style=style.text(size="12px"), locations=loc.body(columns="colname"))
461
457
  .cols_width(
462
- icon="35px", colname="200px", **{stat_col: "60px" for stat_col in present_stat_cols}
458
+ cases={
459
+ "icon": "35px",
460
+ "colname": "200px",
461
+ **{stat_col: "60px" for stat_col in present_stat_cols},
462
+ }
463
463
  )
464
464
  )
465
465
 
@@ -498,7 +498,7 @@ class DataScan:
498
498
  json.dump(json_string, f, indent=4)
499
499
 
500
500
 
501
- def col_summary_tbl(data: FrameT | Any, tbl_name: str | None = None) -> GT:
501
+ def col_summary_tbl(data: Any, tbl_name: str | None = None) -> GT:
502
502
  """
503
503
  Generate a column-level summary table of a dataset.
504
504
 
pointblank/draft.py CHANGED
@@ -4,7 +4,6 @@ from dataclasses import dataclass, field
4
4
  from typing import Any
5
5
 
6
6
  from importlib_resources import files
7
- from narwhals.typing import FrameT
8
7
 
9
8
  from pointblank._constants import MODEL_PROVIDERS
10
9
  from pointblank.datascan import DataScan
@@ -223,7 +222,7 @@ class DraftValidation:
223
222
  be replaced with the actual data variable.
224
223
  """
225
224
 
226
- data: FrameT | Any
225
+ data: Any
227
226
  model: str
228
227
  api_key: str | None = None
229
228
  verify_ssl: bool = True
@@ -328,7 +327,7 @@ class DraftValidation:
328
327
  if provider == "anthropic": # pragma: no cover
329
328
  # Check that the anthropic package is installed
330
329
  try:
331
- import anthropic # noqa
330
+ import anthropic # noqa # type: ignore[import-not-found]
332
331
  except ImportError: # pragma: no cover
333
332
  raise ImportError( # pragma: no cover
334
333
  "The `anthropic` package is required to use the `DraftValidation` class with "
pointblank/scan_profile.py CHANGED
@@ -5,7 +5,7 @@ from collections import defaultdict
5
5
  from collections.abc import Sequence
6
6
  from dataclasses import dataclass, field
7
7
  from enum import Enum
8
- from typing import TYPE_CHECKING, Any
8
+ from typing import TYPE_CHECKING, Any, ClassVar
9
9
 
10
10
  import narwhals as nw
11
11
  from narwhals.dataframe import DataFrame
@@ -96,6 +96,7 @@ class ColumnProfile(_ColumnProfileABC):
96
96
  colname: str
97
97
  coltype: str
98
98
  statistics: MutableSequence[Stat] = field(default_factory=lambda: [])
99
+ _type: ClassVar[_TypeMap] # Defined by subclasses
99
100
 
100
101
  @property
101
102
  def sample_data(self) -> Sequence[Any]:
pointblank/schema.py CHANGED
@@ -2,12 +2,16 @@ from __future__ import annotations
2
2
 
3
3
  import copy
4
4
  from dataclasses import dataclass
5
+ from typing import TYPE_CHECKING
5
6
 
6
7
  import narwhals as nw
7
8
 
8
9
  from pointblank._constants import IBIS_BACKENDS
9
10
  from pointblank._utils import _get_tbl_type, _is_lazy_frame, _is_lib_present, _is_narwhals_table
10
11
 
12
+ if TYPE_CHECKING:
13
+ from typing import Any
14
+
11
15
  __all__ = ["Schema", "_check_schema_match"]
12
16
 
13
17
 
@@ -269,17 +273,15 @@ class Schema:
269
273
  `Schema` object is used in a validation workflow.
270
274
  """
271
275
 
272
- columns: str | list[str] | list[tuple[str, str]] | list[tuple[str]] | dict[str, str] | None = (
273
- None
274
- )
275
- tbl: any | None = None
276
+ columns: list[tuple[str, ...]] | None = None
277
+ tbl: Any | None = None
276
278
 
277
279
  def __init__(
278
280
  self,
279
281
  columns: (
280
282
  str | list[str] | list[tuple[str, str]] | list[tuple[str]] | dict[str, str] | None
281
283
  ) = None,
282
- tbl: any | None = None,
284
+ tbl: Any | None = None,
283
285
  **kwargs,
284
286
  ):
285
287
  if tbl is None and columns is None and not kwargs:
@@ -387,6 +389,8 @@ class Schema:
387
389
  bool
388
390
  True if the columns are the same, False otherwise.
389
391
  """
392
+ if self.columns is None or other.columns is None:
393
+ return self.columns is None and other.columns is None
390
394
 
391
395
  if not case_sensitive_colnames:
392
396
  this_column_list = [col.lower() for col in self.get_column_list()]
@@ -463,6 +467,8 @@ class Schema:
463
467
  bool
464
468
  True if the columns are the same, False otherwise.
465
469
  """
470
+ if self.columns is None or other.columns is None:
471
+ return self.columns is None and other.columns is None
466
472
 
467
473
  if not case_sensitive_colnames:
468
474
  this_column_list = [col.lower() for col in self.get_column_list()]
@@ -547,6 +553,8 @@ class Schema:
547
553
  bool
548
554
  True if the columns are the same, False otherwise.
549
555
  """
556
+ if self.columns is None or other.columns is None:
557
+ return self.columns is None and other.columns is None
550
558
 
551
559
  if not case_sensitive_colnames:
552
560
  this_column_list = [col.lower() for col in self.get_column_list()]
@@ -633,6 +641,8 @@ class Schema:
633
641
  bool
634
642
  True if the columns are the same, False otherwise.
635
643
  """
644
+ if self.columns is None or other.columns is None:
645
+ return self.columns is None and other.columns is None
636
646
 
637
647
  if not case_sensitive_colnames:
638
648
  this_column_list = [col.lower() for col in self.get_column_list()]
@@ -702,6 +712,8 @@ class Schema:
702
712
  list[str]
703
713
  A list of column names.
704
714
  """
715
+ if self.columns is None:
716
+ return []
705
717
  return [col[0] for col in self.columns]
706
718
 
707
719
  def get_dtype_list(self) -> list[str]:
@@ -713,9 +725,11 @@ class Schema:
713
725
  list[str]
714
726
  A list of data types.
715
727
  """
728
+ if self.columns is None:
729
+ return []
716
730
  return [col[1] for col in self.columns]
717
731
 
718
- def get_schema_coerced(self, to: str | None = None) -> dict[str, str]:
732
+ def get_schema_coerced(self, to: str | None = None) -> Schema:
719
733
  # If a table isn't provided, we cannot use this method
720
734
  if self.tbl is None:
721
735
  raise ValueError(
@@ -755,8 +769,15 @@ class Schema:
755
769
  new_schema = copy.deepcopy(Schema(tbl=(self.tbl.to_pandas())))
756
770
  return new_schema
757
771
 
772
+ raise ValueError(
773
+ f"Cannot coerce schema from '{self.tbl_type}' to '{to}'. "
774
+ "Supported conversions: pandas->polars, polars->pandas."
775
+ )
776
+
758
777
  def __str__(self):
759
778
  formatted_columns = []
779
+ if self.columns is None:
780
+ return "Pointblank Schema (empty)"
760
781
  for col in self.columns:
761
782
  if len(col) == 1: # Only column name provided (no data type)
762
783
  formatted_columns.append(f" {col[0]}: <ANY>")
@@ -770,8 +791,15 @@ class Schema:
770
791
 
771
792
 
772
793
  def _process_columns(
773
- *, columns: str | list[str] | list[tuple[str, str]] | dict[str, str] | None = None, **kwargs
774
- ) -> list[tuple[str, str]]:
794
+ *,
795
+ columns: str
796
+ | list[str]
797
+ | list[tuple[str, str]]
798
+ | list[tuple[str]]
799
+ | dict[str, str]
800
+ | None = None,
801
+ **kwargs,
802
+ ) -> list[tuple[str, ...]]:
775
803
  """
776
804
  Process column information provided as individual arguments or as a list of
777
805
  tuples/dictionary.
@@ -785,15 +813,18 @@ def _process_columns(
785
813
 
786
814
  Returns
787
815
  -------
788
- list[tuple[str, str]]
789
- A list of tuples containing column information.
816
+ list[tuple[str, ...]]
817
+ A list of tuples containing column information (name only or name and dtype).
790
818
  """
791
819
  if columns is not None:
792
820
  if isinstance(columns, list):
793
821
  if all(isinstance(col, str) for col in columns):
794
- return [(col,) for col in columns]
822
+ # Type narrowing: after the all() check, columns contains only strings
823
+ str_columns: list[str] = columns # type: ignore[assignment]
824
+ return [(col,) for col in str_columns]
795
825
  else:
796
- return columns
826
+ # Type narrowing: columns contains tuples
827
+ return columns # type: ignore[return-value]
797
828
 
798
829
  if isinstance(columns, str):
799
830
  return [(columns,)]
@@ -810,11 +841,11 @@ def _schema_info_generate_colname_dict(
810
841
  index_matched: bool,
811
842
  matched_to: str | None,
812
843
  dtype_present: bool,
813
- dtype_input: str | list[str],
844
+ dtype_input: str | list[str] | None,
814
845
  dtype_matched: bool,
815
846
  dtype_multiple: bool,
816
- dtype_matched_pos: int,
817
- ) -> dict[str, any]:
847
+ dtype_matched_pos: int | None,
848
+ ) -> dict[str, Any]:
818
849
  return {
819
850
  "colname_matched": colname_matched,
820
851
  "index_matched": index_matched,
@@ -829,8 +860,8 @@ def _schema_info_generate_colname_dict(
829
860
 
830
861
  def _schema_info_generate_columns_dict(
831
862
  colnames: list[str] | None,
832
- colname_dict: list[dict[str, any]] | None,
833
- ) -> dict[str, dict[str, any]]:
863
+ colname_dict: list[dict[str, Any]] | None,
864
+ ) -> dict[str, dict[str, Any]]:
834
865
  """
835
866
  Generate the columns dictionary for the schema information dictionary.
836
867
 
@@ -847,6 +878,7 @@ def _schema_info_generate_columns_dict(
847
878
  dict[str, dict[str, any]]
848
879
  The columns dictionary.
849
880
  """
881
+ assert colnames is not None and colname_dict is not None
850
882
  return {colnames[i]: colname_dict[i] for i in range(len(colnames))}
851
883
 
852
884
 
@@ -856,7 +888,7 @@ def _schema_info_generate_params_dict(
856
888
  case_sensitive_colnames: bool,
857
889
  case_sensitive_dtypes: bool,
858
890
  full_match_dtypes: bool,
859
- ) -> dict[str, any]:
891
+ ) -> dict[str, Any]:
860
892
  """
861
893
  Generate the parameters dictionary for the schema information dictionary.
862
894
 
@@ -889,7 +921,7 @@ def _schema_info_generate_params_dict(
889
921
 
890
922
 
891
923
  def _get_schema_validation_info(
892
- data_tbl: any,
924
+ data_tbl: Any,
893
925
  schema: Schema,
894
926
  passed: bool,
895
927
  complete: bool,
@@ -897,7 +929,7 @@ def _get_schema_validation_info(
897
929
  case_sensitive_colnames: bool,
898
930
  case_sensitive_dtypes: bool,
899
931
  full_match_dtypes: bool,
900
- ) -> dict[str, any]:
932
+ ) -> dict[str, Any]:
901
933
  """
902
934
  Get the schema validation information dictionary.
903
935
 
@@ -949,6 +981,10 @@ def _get_schema_validation_info(
949
981
  schema_exp = schema
950
982
  schema_tgt = Schema(tbl=data_tbl)
951
983
 
984
+ # Both schemas must have columns for validation
985
+ assert schema_exp.columns is not None, "Expected schema must have columns"
986
+ assert schema_tgt.columns is not None, "Target schema must have columns"
987
+
952
988
  # Initialize the schema information dictionary
953
989
  schema_info = {
954
990
  "passed": passed,
@@ -1122,6 +1158,11 @@ def _get_schema_validation_info(
1122
1158
  #
1123
1159
 
1124
1160
  if colname_matched and dtype_present:
1161
+ # Type narrowing: matched_to is not None when colname_matched is True
1162
+ # and dtype_input is not None when dtype_present is True
1163
+ assert matched_to is not None
1164
+ assert dtype_input is not None
1165
+
1125
1166
  # Get the dtype of the column in the target table
1126
1167
  dtype_tgt = schema_tgt.columns[tgt_colnames.index(matched_to)][1]
1127
1168
 
pointblank/thresholds.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from dataclasses import dataclass, field
4
- from typing import Callable
4
+ from typing import Callable, cast
5
5
 
6
6
  __all__ = ["Thresholds", "Actions", "FinalActions"]
7
7
 
@@ -180,14 +180,15 @@ class Thresholds:
180
180
  # The threshold value might be an absolute count, but we need to convert
181
181
  # it to a fractional value
182
182
  if isinstance(threshold_value, int):
183
- threshold_value = _convert_abs_count_to_fraction(
184
- value=threshold_value, test_units=test_units
185
- )
183
+ converted = _convert_abs_count_to_fraction(value=threshold_value, test_units=test_units)
184
+ if converted is None:
185
+ return None
186
+ threshold_value = converted
186
187
 
187
188
  return fraction_failing >= threshold_value
188
189
 
189
190
 
190
- def _convert_abs_count_to_fraction(value: int | None, test_units: int) -> float:
191
+ def _convert_abs_count_to_fraction(value: int | None, test_units: int) -> float | None:
191
192
  # Using a integer value signifying the total number of 'test units' (in the
192
193
  # context of a validation), we convert an integer count (absolute) threshold
193
194
  # value to a fractional threshold value
@@ -251,12 +252,12 @@ def _normalize_thresholds_creation(
251
252
  # any of these keys
252
253
 
253
254
  # Check keys for invalid entries and raise a ValueError if any are found
254
- invalid_keys = set(thresholds.keys()) - {"warning", "error", "critical"}
255
+ invalid_keys: set = set(thresholds.keys()) - {"warning", "error", "critical"}
255
256
 
256
257
  if invalid_keys:
257
258
  raise ValueError(f"Invalid keys in the thresholds dictionary: {invalid_keys}")
258
259
 
259
- thresholds = Thresholds(**thresholds)
260
+ thresholds = Thresholds(**cast(dict[str, int | float | None], thresholds))
260
261
 
261
262
  elif isinstance(thresholds, Thresholds):
262
263
  pass
@@ -483,12 +484,12 @@ class Actions:
483
484
 
484
485
  def _ensure_list(
485
486
  self, value: str | Callable | list[str | Callable] | None
486
- ) -> list[str | Callable]:
487
+ ) -> list[str | Callable] | None:
487
488
  if value is None:
488
489
  return None
489
- if not isinstance(value, list):
490
- return [value]
491
- return value
490
+ if isinstance(value, list):
491
+ return cast(list[str | Callable], value)
492
+ return [value]
492
493
 
493
494
  def __repr__(self) -> str:
494
495
  return f"Actions(warning={self.warning}, error={self.error}, critical={self.critical})"
@@ -627,13 +628,14 @@ class FinalActions:
627
628
  def __repr__(self) -> str:
628
629
  if isinstance(self.actions, list):
629
630
  action_reprs = ", ".join(
630
- f"'{a}'" if isinstance(a, str) else a.__name__ for a in self.actions
631
+ f"'{a}'" if isinstance(a, str) else getattr(a, "__name__", repr(a))
632
+ for a in self.actions
631
633
  )
632
634
  return f"FinalActions([{action_reprs}])"
633
635
  elif isinstance(self.actions, str):
634
636
  return f"FinalActions('{self.actions}')"
635
637
  elif callable(self.actions):
636
- return f"FinalActions({self.actions.__name__})"
638
+ return f"FinalActions({getattr(self.actions, '__name__', repr(self.actions))})"
637
639
  else:
638
640
  return f"FinalActions({self.actions})" # pragma: no cover
639
641