pointblank 0.9.6__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,321 @@
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+ from collections import defaultdict
+ from collections.abc import Sequence
+ from dataclasses import dataclass, field
+ from enum import Enum
+ from typing import TYPE_CHECKING, Any
+
+ import narwhals as nw
+ from narwhals.dataframe import DataFrame
+
+ from pointblank._constants import SVG_ICONS_FOR_DATA_TYPES
+ from pointblank._utils import transpose_dicts
+ from pointblank.scan_profile_stats import (
+     FreqStat,
+     IQRStat,
+     MaxStat,
+     MeanStat,
+     MedianStat,
+     MinStat,
+     NMissing,
+     NUnique,
+     P05Stat,
+     P95Stat,
+     Q1Stat,
+     Q3Stat,
+     Stat,
+     StdStat,
+ )
+
+ if TYPE_CHECKING:
+     from collections.abc import MutableSequence
+
+     from narwhals.typing import Frame
+
+
+ ## Types that may cause unrecoverable errors and don't pose any value
+ ILLEGAL_TYPES = ("struct",)
+
+
+ class _TypeMap(Enum): # ! ordered;
+     # TODO: consolidate w/other stats?
+     NUMERIC = ("int", "float")
+     STRING = ("string", "categorical")
+     DATE = ("date",)
+     BOOL = ("bool",)
+
+     @classmethod
+     def is_illegal(cls, dtype: Any) -> bool:
+         return any(ind for ind in ILLEGAL_TYPES if ind in str(dtype).lower())
+
+     @classmethod
+     def fetch_prof_map(cls) -> dict[_TypeMap, type[ColumnProfile]]:
+         default = defaultdict(lambda: ColumnProfile)
+         implemented_dict: dict[_TypeMap, type[ColumnProfile]] = {
+             cls.BOOL: _BoolProfile,
+             cls.NUMERIC: _NumericProfile,
+             cls.STRING: _StringProfile,
+             cls.DATE: _DateProfile,
+         }
+         return default | implemented_dict
+
+     @classmethod
+     def fetch_profile(cls, dtype: Any) -> type[ColumnProfile]:
+         stringified: str = str(dtype).lower()
+         for _type in cls:
+             inds: tuple[str, ...] = _type.value
+             is_match: bool = any(ind for ind in inds if ind in stringified)
+             if is_match:
+                 return cls.fetch_prof_map()[_type]
+         raise NotImplementedError # pragma: no-cover
+
+     @classmethod
+     def fetch_icon(cls, _type: _TypeMap) -> str:
+         icon_map = {
+             cls.NUMERIC: "numeric",
+             cls.STRING: "string",
+             cls.DATE: "date",
+             cls.BOOL: "boolean",
+         }
+         try:
+             icon_key = icon_map[_type]
+         except KeyError:
+             icon_key = "object"
+         return SVG_ICONS_FOR_DATA_TYPES[icon_key]
+
+
+ class _ColumnProfileABC(ABC):
+     @abstractmethod
+     def calc_stats(self, data: Frame) -> None: ...
+
+
+ @dataclass
+ class ColumnProfile(_ColumnProfileABC):
+     colname: str
+     coltype: str
+     statistics: MutableSequence[Stat] = field(default_factory=lambda: [])
+
+     @property
+     def sample_data(self) -> Sequence[Any]:
+         return self._sample_data
+
+     @sample_data.setter
+     def sample_data(self, value: object) -> None:
+         if isinstance(value, Sequence):
+             self._sample_data = value
+             return
+         raise NotImplementedError # pragma: no cover
+
+     def spawn_profile(self, _subprofile: type[ColumnProfile]) -> ColumnProfile:
+         inst = _subprofile(coltype=self.coltype, colname=self.colname, statistics=self.statistics)
+         # instantiate non-initializing properties
+         inst.sample_data = self.sample_data
+         return inst
+
+     def calc_stats(self, data: Frame) -> None:
+         summarized = _as_physical(
+             data.select(_col=self.colname).select(_nmissing=NMissing.expr, _nunique=NUnique.expr)
+         ).to_dict()
+
+         self.statistics.extend(
+             [
+                 NMissing(summarized["_nmissing"].item()),
+                 NUnique(summarized["_nunique"].item()),
+             ]
+         )
+
+
+ class _DateProfile(ColumnProfile):
+     _type: _TypeMap = _TypeMap.DATE
+
+     def calc_stats(self, data: Frame):
+         res = data.rename({self.colname: "_col"}).select(_min=MinStat.expr, _max=MaxStat.expr)
+
+         physical = _as_physical(res).to_dict()
+
+         self.statistics.extend(
+             [
+                 MinStat(physical["_min"].item()),
+                 MaxStat(physical["_max"].item()),
+             ]
+         )
+
+
+ class _BoolProfile(ColumnProfile):
+     _type: _TypeMap = _TypeMap.BOOL
+
+     def calc_stats(self, data: Frame) -> None:
+         group_by_contexts = (
+             data.rename({self.colname: "_col"}).group_by("_col").agg(_freq=FreqStat.expr)
+         )
+
+         summarized_groupby = _as_physical(group_by_contexts).to_dict()
+
+         # TODO: Need a real way to do this
+         col_vals: list[Any] = summarized_groupby["_col"].to_list()
+         freqs: list[int] = summarized_groupby["_freq"].to_list()
+
+         freq_dict: dict[str, int] = {
+             str(colval): freq for colval, freq in zip(col_vals, freqs, strict=True)
+         }
+
+         self.statistics.extend([FreqStat(freq_dict)])
+
+
+ class _StringProfile(ColumnProfile):
+     _type: _TypeMap = _TypeMap.STRING
+
+     def calc_stats(self, data: Frame):
+         str_data = data.select(nw.all().cast(nw.String).str.len_chars())
+
+         # TODO: We should get an FreqStat here; estimate cardinality first
+
+         summarized = (
+             str_data.rename({self.colname: "_col"})
+             .select(
+                 _mean=MeanStat.expr,
+                 _median=MedianStat.expr,
+                 _std=StdStat.expr,
+                 _min=MinStat.expr,
+                 _max=MaxStat.expr,
+                 _p_05=P05Stat.expr,
+                 _q_1=Q1Stat.expr,
+                 _q_3=Q3Stat.expr,
+                 _p_95=P95Stat.expr,
+             )
+             .with_columns(
+                 _iqr=IQRStat.expr,
+             )
+         )
+
+         physical = _as_physical(summarized).to_dict()
+         self.statistics.extend(
+             [
+                 MeanStat(physical["_mean"].item()),
+                 MedianStat(physical["_median"].item()),
+                 StdStat(physical["_std"].item()),
+                 MinStat(physical["_min"].item()),
+                 MaxStat(physical["_max"].item()),
+                 P05Stat(physical["_p_05"].item()),
+                 Q1Stat(physical["_q_1"].item()),
+                 Q3Stat(physical["_q_3"].item()),
+                 P95Stat(physical["_p_95"].item()),
+                 IQRStat(physical["_iqr"].item()),
+             ]
+         )
+
+
+ class _NumericProfile(ColumnProfile):
+     _type: _TypeMap = _TypeMap.NUMERIC
+
+     def calc_stats(self, data: Frame):
+         res = (
+             data.rename({self.colname: "_col"})
+             .select(
+                 _mean=MeanStat.expr,
+                 _median=MedianStat.expr,
+                 _std=StdStat.expr,
+                 _min=MinStat.expr,
+                 _max=MaxStat.expr,
+                 _p_05=P05Stat.expr,
+                 _q_1=Q1Stat.expr,
+                 _q_3=Q3Stat.expr,
+                 _p_95=P95Stat.expr,
+             )
+             # TODO: need a consistent way to indicate this
+             .with_columns(_iqr=IQRStat.expr)
+         )
+
+         summarized = _as_physical(res).to_dict()
+         self.statistics.extend(
+             [
+                 MeanStat(summarized["_mean"].item()),
+                 MedianStat(summarized["_median"].item()),
+                 StdStat(summarized["_std"].item()),
+                 MinStat(summarized["_min"].item()),
+                 MaxStat(summarized["_max"].item()),
+                 P05Stat(summarized["_p_05"].item()),
+                 Q1Stat(summarized["_q_1"].item()),
+                 Q3Stat(summarized["_q_3"].item()),
+                 P95Stat(summarized["_p_95"].item()),
+                 IQRStat(summarized["_iqr"].item()),
+             ]
+         )
+
+
+ class _DataProfile: # TODO: feels redundant and weird
+     def __init__(
+         self,
+         table_name: str | None,
+         columns: list[str],
+         implementation: nw.Implementation,
+     ):
+         self.table_name: str | None = table_name
+         self.columns: list[str] = columns
+         self.implementation = implementation
+         self.column_profiles: list[ColumnProfile] = []
+
+     def set_row_count(self, data: Frame) -> None:
+         assert self.columns # internal: cols should already be set
+
+         slim = data.select(nw.col(self.columns[0]))
+
+         physical = _as_physical(slim)
+
+         self.row_count = len(physical)
+
+     def as_dataframe(self, *, strict: bool = True) -> DataFrame:
+         assert self.column_profiles
+
+         cols: list[dict[str, Any]] = []
+         for prof in self.column_profiles:
+             stat_vals = {}
+             for stat in prof.statistics:
+                 stat_vals[stat.name] = stat.val
+
+             stat_vals |= {"colname": prof.colname}
+             stat_vals |= {"coltype": str(prof.coltype)}
+             stat_vals |= {"sample_data": str(prof.sample_data)} # TODO: not a good way to do this
+             stat_vals |= {"icon": _TypeMap.fetch_icon(prof._type)}
+             cols.append(stat_vals)
+
+         # Stringify if type mismatch
+         # Get all unique keys across all dictionaries
+         all_keys = set().union(*(d.keys() for d in cols))
+
+         for key in all_keys:
+             # Get all values for this key across all dictionaries
+             values = [d.get(key) for d in cols if key in d]
+
+             # Check if all values are of the same type
+             if len(values) > 1:
+                 first_type = type(values[0])
+
+                 # use `type` instead of instance check because some types are sub
+                 # classes of supers, ie. date is a subclass of datetime, so it's
+                 # technically an instance. This however would fail most dataframe
+                 # instantiations that require consistent types.
+                 all_same_type: bool = all(type(v) is first_type for v in values[1:])
+                 if not all_same_type:
+                     if strict:
+                         msg = f"Some types in {key!s} stat are different. Turn off `strict` to bypass."
+                         raise TypeError(msg)
+                     for d in cols:
+                         if key in d:
+                             d[key] = str(d[key])
+
+         return nw.from_dict(transpose_dicts(cols), backend=self.implementation)
+
+     def __repr__(self) -> str: # pragma: no cover
+         return f"<_DataProfile(table_name={self.table_name}, row_count={self.row_count}, columns={self.columns})>"
+
+
+ def _as_physical(data: Frame) -> DataFrame:
+     try:
+         # TODO: might be a built in way to do this
+         return data.collect() # type: ignore[union-attr]
+     except AttributeError:
+         assert isinstance(data, DataFrame) # help mypy
+         return data
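
The hunk above adds a new column-profiling module (its path is not shown in this diff; `pointblank/scan_profile.py` is an assumption). It maps narwhals dtypes to per-type profiler classes and computes statistics by evaluating narwhals expressions against a column renamed to "_col". Below is a minimal usage sketch under those assumptions, with a Polars backend; the classes are internal, so this is illustrative only, not a public API.

# Illustrative sketch, not part of the diff. Assumes the module path
# `pointblank.scan_profile` and that polars/narwhals are installed.
import narwhals as nw
import polars as pl

from pointblank.scan_profile import _TypeMap

data = nw.from_native(pl.DataFrame({"x": [1.0, 2.0, None, 4.0]}))

dtype = data.schema["x"]                     # narwhals dtype, e.g. Float64
profile_cls = _TypeMap.fetch_profile(dtype)  # "float" matches _TypeMap.NUMERIC -> _NumericProfile

profile = profile_cls(colname="x", coltype=str(dtype))
profile.calc_stats(data)                     # extends profile.statistics with MeanStat, MedianStat, ...

for stat in profile.statistics:
    print(stat.name, stat.val)

A per-column scan would follow the same flow: pick the profile class from the dtype, compute the type-specific statistics, and read them back from `statistics`.
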
@@ -0,0 +1,180 @@
+ from __future__ import annotations
+
+ from abc import ABC
+ from dataclasses import dataclass
+ from enum import Enum, auto
+ from typing import TYPE_CHECKING, ClassVar
+
+ import narwhals as nw
+
+ from pointblank._utils_html import _make_sublabel
+
+ if TYPE_CHECKING:
+     from typing import Any
+
+
+ class StatGroup(Enum):
+     DESCR = auto()
+     SUMMARY = auto()
+     STRUCTURE = auto()
+     LOGIC = auto()
+     IQR = auto()
+     FREQ = auto()
+     BOUNDS = auto()
+
+
+ # TODO: Make sure all these subclasses are suffixed w/`Stat`
+ # TODO: Replace all the nw.all w/_col
+
+
+ class Stat(ABC):
+     val: Any
+     name: ClassVar[str]
+     group: ClassVar[StatGroup]
+     expr: ClassVar[nw.Expr]
+     label: ClassVar[str]
+
+     def __eq__(self, value) -> bool:
+         if isinstance(value, str):
+             return value == self.name
+         if isinstance(value, Stat):
+             return value is self
+         return NotImplemented
+
+     @classmethod
+     def _fetch_priv_name(self) -> str:
+         return f"_{self.name}"
+
+
+ @dataclass(frozen=True)
+ class MeanStat(Stat):
+     val: str
+     name: ClassVar[str] = "mean"
+     group = StatGroup.SUMMARY
+     expr: ClassVar[nw.Expr] = nw.col("_col").mean()
+     label: ClassVar[str] = "Mean"
+
+
+ @dataclass(frozen=True)
+ class StdStat(Stat): # TODO: Rename this SD for consistency
+     val: str
+     name: ClassVar[str] = "std"
+     group = StatGroup.SUMMARY
+     expr: ClassVar[nw.Expr] = nw.col("_col").std()
+     label: ClassVar[str] = "SD"
+
+
+ @dataclass(frozen=True)
+ class MinStat(Stat):
+     val: str
+     name: ClassVar[str] = "min"
+     group = StatGroup.BOUNDS # TODO: These should get put back in DESCR once datetime p*
+     expr: ClassVar[nw.Expr] = nw.col("_col").min() # don't cast as float, can be date
+     label: ClassVar[str] = "Min"
+
+
+ @dataclass(frozen=True)
+ class MaxStat(Stat):
+     val: str
+     name: ClassVar[str] = "max"
+     group = StatGroup.BOUNDS # TODO: These should get put back in DESCR once datetime p*
+     expr: ClassVar[nw.Expr] = nw.col("_col").max() # don't cast as float, can be date
+     label: ClassVar[str] = "Max"
+
+
+ @dataclass(frozen=True)
+ class P05Stat(Stat):
+     val: str
+     name: ClassVar[str] = "p05"
+     group = StatGroup.DESCR
+     expr: ClassVar[nw.Expr] = nw.col("_col").quantile(0.005, interpolation="linear")
+     label: ClassVar[str] = _make_sublabel("P", "5")
+
+
+ @dataclass(frozen=True)
+ class Q1Stat(Stat):
+     val: str
+     name: ClassVar[str] = "q_1"
+     group = StatGroup.DESCR
+     expr: ClassVar[nw.Expr] = nw.col("_col").quantile(0.25, interpolation="linear")
+     label: ClassVar[str] = _make_sublabel("Q", "1")
+
+
+ @dataclass(frozen=True)
+ class MedianStat(Stat):
+     val: str
+     name: ClassVar[str] = "median"
+     group = StatGroup.DESCR
+     expr: ClassVar[nw.Expr] = nw.col("_col").median()
+     label: ClassVar[str] = "Med"
+
+
+ @dataclass(frozen=True)
+ class Q3Stat(Stat):
+     val: str
+     name: ClassVar[str] = "q_3"
+     group = StatGroup.DESCR
+     expr: ClassVar[nw.Expr] = nw.col("_col").quantile(0.75, interpolation="linear")
+     label: ClassVar[str] = _make_sublabel("Q", "3")
+
+
+ @dataclass(frozen=True)
+ class P95Stat(Stat):
+     val: str
+     name: ClassVar[str] = "p95"
+     group = StatGroup.DESCR
+     expr: ClassVar[nw.Expr] = nw.col("_col").quantile(0.95, interpolation="linear")
+     label: ClassVar[str] = _make_sublabel("P", "95")
+
+
+ @dataclass(frozen=True)
+ class IQRStat(Stat):
+     val: str
+     name: ClassVar[str] = "iqr"
+     group = StatGroup.IQR
+     expr: ClassVar[nw.Expr] = nw.col(Q3Stat._fetch_priv_name()) - nw.col(Q1Stat._fetch_priv_name())
+     label: ClassVar[str] = "IQR"
+
+
+ @dataclass(frozen=True)
+ class FreqStat(Stat):
+     val: dict[str, int] # the key must be stringified
+     name: ClassVar[str] = "freqs"
+     group = StatGroup.FREQ
+     expr: ClassVar[nw.Expr] = nw.len()
+     label: ClassVar[str] = "Freq"
+
+
+ @dataclass(frozen=True)
+ class NMissing(Stat):
+     val: int
+     name: ClassVar[str] = "n_missing"
+     group = StatGroup.STRUCTURE
+     expr: ClassVar[nw.Expr] = nw.col("_col").null_count().cast(nw.Int64)
+     label: ClassVar[str] = "NA"
+
+
+ @dataclass(frozen=True)
+ class NUnique(Stat):
+     val: int
+     name: ClassVar[str] = "n_unique"
+     group = StatGroup.STRUCTURE
+     expr: ClassVar[nw.Expr] = nw.col("_col").n_unique().cast(nw.Int64)
+     label: ClassVar[str] = "UQ"
+
+
+ COLUMN_ORDER_REGISTRY: tuple[type[Stat], ...] = (
+     NMissing,
+     NUnique,
+     MeanStat,
+     StdStat,
+     MinStat,
+     P05Stat,
+     Q1Stat,
+     MedianStat,
+     Q3Stat,
+     P95Stat,
+     MaxStat,
+     FreqStat,
+     IQRStat,
+ )
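
This second hunk (assumed to be `pointblank/scan_profile_stats.py`, since the profiling hunk imports these names from `pointblank.scan_profile_stats`) defines one frozen dataclass per statistic. Each class pairs a reusable narwhals expression, evaluated against a column renamed to "_col", with a value holder, a group, and a display label, and `COLUMN_ORDER_REGISTRY` fixes the reporting order. A minimal sketch of evaluating and materializing two of these stats, under the same module-path assumption:

# Illustrative sketch, not part of the diff. Assumes the module path
# `pointblank.scan_profile_stats` and a Polars-backed narwhals frame.
import narwhals as nw
import polars as pl

from pointblank.scan_profile_stats import MeanStat, NMissing

data = nw.from_native(pl.DataFrame({"score": [1.0, 2.0, None, 4.0]}))

summary = data.rename({"score": "_col"}).select(
    _mean=MeanStat.expr,        # nw.col("_col").mean()
    _nmissing=NMissing.expr,    # nw.col("_col").null_count().cast(nw.Int64)
)

stats = [
    MeanStat(summary["_mean"].item()),
    NMissing(summary["_nmissing"].item()),
]
for stat in stats:
    print(f"{stat.label}: {stat.val}")  # e.g. "Mean: 2.33...", then "NA: 1"
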
pointblank/schema.py CHANGED
@@ -4,7 +4,7 @@ import copy
  from dataclasses import dataclass
 
  from pointblank._constants import IBIS_BACKENDS
- from pointblank._utils import _get_tbl_type, _is_lib_present
+ from pointblank._utils import _get_tbl_type, _is_lazy_frame, _is_lib_present, _is_narwhals_table
 
  __all__ = ["Schema"]
 
@@ -315,13 +315,24 @@ class Schema:
          table_type = _get_tbl_type(self.tbl)
 
          # Collect column names and dtypes from the DataFrame and store as a list of tuples
-         if table_type == "pandas":
+         if _is_narwhals_table(self.tbl):
+             if _is_lazy_frame(data=self.tbl):
+                 schema_dict = dict(self.tbl.collect_schema())
+             else:
+                 schema_dict = dict(self.tbl.schema.items())
+             schema_dict = {k: str(v) for k, v in schema_dict.items()}
+             self.columns = list(schema_dict.items())
+
+         elif table_type == "pandas":
              schema_dict = dict(self.tbl.dtypes)
              schema_dict = {k: str(v) for k, v in schema_dict.items()}
              self.columns = list(schema_dict.items())
 
          elif table_type == "polars":
-             schema_dict = dict(self.tbl.schema.items())
+             if _is_lazy_frame(data=self.tbl):
+                 schema_dict = dict(self.tbl.collect_schema())
+             else:
+                 schema_dict = dict(self.tbl.schema.items())
              schema_dict = {k: str(v) for k, v in schema_dict.items()}
              self.columns = list(schema_dict.items())
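
The schema.py change above lets Schema read column names and dtypes from narwhals-wrapped tables and from lazy frames (via collect_schema()) instead of requiring an eager pandas/Polars frame. A minimal sketch of the behaviour this enables, assuming Schema is constructed with its tbl= argument and that Polars >= 1.0 provides LazyFrame.collect_schema(); the printed dtype strings are indicative only:

# Illustrative sketch, not part of the diff.
import polars as pl
import pointblank as pb

lazy = pl.LazyFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

schema = pb.Schema(tbl=lazy)  # schema is collected without materializing the frame
print(schema.columns)         # e.g. [("a", "Int64"), ("b", "String")]
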