pointblank 0.9.5__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
pointblank/datascan.py CHANGED
@@ -1,24 +1,31 @@
  from __future__ import annotations

+ import contextlib
  import json
- from dataclasses import dataclass, field
  from importlib.metadata import version
- from math import floor, log10
- from typing import Any
+ from typing import TYPE_CHECKING, Any

  import narwhals as nw
  from great_tables import GT, google_font, html, loc, style
- from great_tables.vals import fmt_integer, fmt_number, fmt_scientific
+ from narwhals.dataframe import LazyFrame
  from narwhals.typing import FrameT

- from pointblank._constants import SVG_ICONS_FOR_DATA_TYPES
- from pointblank._utils import _get_tbl_type, _select_df_lib
- from pointblank._utils_html import _create_table_dims_html, _create_table_type_html
+ from pointblank._utils_html import _create_table_dims_html, _create_table_type_html, _fmt_frac
+ from pointblank.scan_profile import ColumnProfile, _as_physical, _DataProfile, _TypeMap
+ from pointblank.scan_profile_stats import COLUMN_ORDER_REGISTRY
+
+ if TYPE_CHECKING:
+     from collections.abc import Mapping, Sequence
+
+     from narwhals.dataframe import DataFrame
+     from narwhals.typing import Frame, IntoFrameT
+
+     from pointblank.scan_profile_stats import StatGroup
+

  __all__ = ["DataScan", "col_summary_tbl"]


- @dataclass
  class DataScan:
      """
      Get a summary of a dataset.
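Note on the import changes in the hunk above: the typing-only names now live behind an `if TYPE_CHECKING:` guard, so static type checkers resolve them but nothing extra is imported at runtime. A minimal sketch of the idiom, with illustrative names (not from pointblank):

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by static type checkers (mypy, pyright); skipped at
    # runtime, which avoids import cycles and startup cost.
    from collections.abc import Sequence


def head3(xs: Sequence[int]) -> list[int]:
    # With `from __future__ import annotations`, annotations are lazily
    # evaluated, so the guarded import is never needed when this runs.
    return list(xs[:3])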
@@ -113,565 +120,92 @@ class DataScan:
      A DataScan object.
      """

-     data: FrameT | Any
-     tbl_name: str | None = None
-     data_alt: Any | None = field(init=False)
-     tbl_category: str = field(init=False)
-     tbl_type: str = field(init=False)
-     profile: dict = field(init=False)
-
-     def __post_init__(self):
-         # Determine if the data is a DataFrame that could be handled by Narwhals,
-         # or an Ibis Table
-         self.tbl_type = _get_tbl_type(data=self.data)
-         ibis_tbl = "ibis.expr.types.relations.Table" in str(type(self.data))
-         pl_pd_tbl = "polars" in self.tbl_type or "pandas" in self.tbl_type
-
-         # Set the table category based on the type of table (this will be used to determine
-         # how to handle the data)
-         if ibis_tbl:
-             self.tbl_category = "ibis"
-         else:
-             self.tbl_category = "dataframe"
-
-         # If the data is DataFrame, convert it to a Narwhals DataFrame
-         if pl_pd_tbl:
-             self.data_alt = nw.from_native(self.data)
-         else:
-             self.data_alt = None
-
-         # Generate the profile based on the `tbl_category` value
-         if self.tbl_category == "dataframe":
-             self.profile = self._generate_profile_df()
-
-         if self.tbl_category == "ibis":
-             self.profile = self._generate_profile_ibis()
-
-     def _generate_profile_df(self) -> dict:
-         profile = {}
-
-         if self.tbl_name:
-             profile["tbl_name"] = self.tbl_name
-
-         row_count = self.data_alt.shape[0]
-         column_count = self.data_alt.shape[1]
-
-         profile.update(
-             {
-                 "tbl_type": self.tbl_type,
-                 "dimensions": {"rows": row_count, "columns": column_count},
-                 "columns": [],
-             }
-         )
-
-         for idx, column in enumerate(self.data_alt.columns):
-             col_data = self.data_alt[column]
-             native_dtype = str(self.data[column].dtype)
-
-             #
-             # Collection of sample data
-             #
-             if "date" in str(col_data.dtype).lower():
-                 sample_data = col_data.drop_nulls().head(5).cast(nw.String).to_list()
-                 sample_data = [str(x) for x in sample_data]
-             else:
-                 sample_data = col_data.drop_nulls().head(5).to_list()
-
-             n_missing_vals = int(col_data.is_null().sum())
-             n_unique_vals = int(col_data.n_unique())
-
-             # If there are missing values, subtract 1 from the number of unique values
-             # to account for the missing value which shouldn't be included in the count
-             if (n_missing_vals > 0) and (n_unique_vals > 0):
-                 n_unique_vals = n_unique_vals - 1
-
-             f_missing_vals = _round_to_sig_figs(n_missing_vals / row_count, 3)
-             f_unique_vals = _round_to_sig_figs(n_unique_vals / row_count, 3)
-
-             col_profile = {
-                 "column_name": column,
-                 "column_type": native_dtype,
-                 "column_number": idx + 1,
-                 "n_missing_values": n_missing_vals,
-                 "f_missing_values": f_missing_vals,
-                 "n_unique_values": n_unique_vals,
-                 "f_unique_values": f_unique_vals,
-             }
-
-             #
-             # Numerical columns
-             #
-             if "int" in str(col_data.dtype).lower() or "float" in str(col_data.dtype).lower():
-                 n_negative_vals = int(col_data.is_between(-1e26, -1e-26).sum())
-                 f_negative_vals = _round_to_sig_figs(n_negative_vals / row_count, 3)
-
-                 n_zero_vals = int(col_data.is_between(0, 0).sum())
-                 f_zero_vals = _round_to_sig_figs(n_zero_vals / row_count, 3)
-
-                 n_positive_vals = row_count - n_missing_vals - n_negative_vals - n_zero_vals
-                 f_positive_vals = _round_to_sig_figs(n_positive_vals / row_count, 3)
-
-                 col_profile_additional = {
-                     "n_negative_values": n_negative_vals,
-                     "f_negative_values": f_negative_vals,
-                     "n_zero_values": n_zero_vals,
-                     "f_zero_values": f_zero_vals,
-                     "n_positive_values": n_positive_vals,
-                     "f_positive_values": f_positive_vals,
-                     "sample_data": sample_data,
-                 }
-                 col_profile.update(col_profile_additional)
-
-                 col_profile_stats = {
-                     "statistics": {
-                         "numerical": {
-                             "descriptive": {
-                                 "mean": round(float(col_data.mean()), 2),
-                                 "std_dev": round(float(col_data.std()), 4),
-                             },
-                             "quantiles": {
-                                 "min": float(col_data.min()),
-                                 "p05": round(
-                                     float(col_data.quantile(0.05, interpolation="linear")), 2
-                                 ),
-                                 "q_1": round(
-                                     float(col_data.quantile(0.25, interpolation="linear")), 2
-                                 ),
-                                 "med": float(col_data.median()),
-                                 "q_3": round(
-                                     float(col_data.quantile(0.75, interpolation="linear")), 2
-                                 ),
-                                 "p95": round(
-                                     float(col_data.quantile(0.95, interpolation="linear")), 2
-                                 ),
-                                 "max": float(col_data.max()),
-                                 "iqr": round(
-                                     float(col_data.quantile(0.75, interpolation="linear"))
-                                     - float(col_data.quantile(0.25, interpolation="linear")),
-                                     2,
-                                 ),
-                             },
-                         }
-                     }
-                 }
-                 col_profile.update(col_profile_stats)
-
-             #
-             # String columns
-             #
-             elif (
-                 "string" in str(col_data.dtype).lower()
-                 or "categorical" in str(col_data.dtype).lower()
-             ):
-                 col_profile_additional = {
-                     "sample_data": sample_data,
-                 }
-                 col_profile.update(col_profile_additional)
-
-                 # Transform `col_data` to a column of string lengths
-                 col_str_len_data = col_data.str.len_chars()
-
-                 col_profile_stats = {
-                     "statistics": {
-                         "string_lengths": {
-                             "descriptive": {
-                                 "mean": round(float(col_str_len_data.mean()), 2),
-                                 "std_dev": round(float(col_str_len_data.std()), 4),
-                             },
-                             "quantiles": {
-                                 "min": int(col_str_len_data.min()),
-                                 "p05": int(col_str_len_data.quantile(0.05, interpolation="linear")),
-                                 "q_1": int(col_str_len_data.quantile(0.25, interpolation="linear")),
-                                 "med": int(col_str_len_data.median()),
-                                 "q_3": int(col_str_len_data.quantile(0.75, interpolation="linear")),
-                                 "p95": int(col_str_len_data.quantile(0.95, interpolation="linear")),
-                                 "max": int(col_str_len_data.max()),
-                                 "iqr": int(col_str_len_data.quantile(0.75, interpolation="linear"))
-                                 - int(col_str_len_data.quantile(0.25, interpolation="linear")),
-                             },
-                         }
-                     }
-                 }
-                 col_profile.update(col_profile_stats)
-
-             #
-             # Date and datetime columns
-             #
-             elif "date" in str(col_data.dtype).lower():
-                 col_profile_additional = {
-                     "sample_data": sample_data,
-                 }
-                 col_profile.update(col_profile_additional)
-
-                 min_date = str(col_data.min())
-                 max_date = str(col_data.max())
-
-                 col_profile_stats = {
-                     "statistics": {
-                         "datetime": {
-                             "min": min_date,
-                             "max": max_date,
-                         }
-                     }
-                 }
-                 col_profile.update(col_profile_stats)
-
-             #
-             # Boolean columns
-             #
-             elif "bool" in str(col_data.dtype).lower():
-                 col_profile_additional = {
-                     "sample_data": sample_data,
-                 }
-                 col_profile.update(col_profile_additional)
-
-                 n_true_values = int(col_data.sum())
-                 f_true_values = _round_to_sig_figs(n_true_values / row_count, 3)
-
-                 n_false_values = row_count - n_missing_vals - n_true_values
-                 f_false_values = _round_to_sig_figs(n_false_values / row_count, 3)
-
-                 col_profile_stats = {
-                     "statistics": {
-                         "boolean": {
-                             "n_true_values": n_true_values,
-                             "f_true_values": f_true_values,
-                             "n_false_values": n_false_values,
-                             "f_false_values": f_false_values,
-                         }
-                     }
-                 }
-                 col_profile.update(col_profile_stats)
-
-             profile["columns"].append(col_profile)
-
-         return profile
-
-     def _generate_profile_ibis(self) -> dict:
-         profile = {}
+     # TODO: This needs to be generically typed at the class level, ie. DataScan[T]
+     def __init__(self, data: IntoFrameT, tbl_name: str | None = None) -> None:
+         as_native = nw.from_native(data)

-         if self.tbl_name:
-             profile["tbl_name"] = self.tbl_name
+         if as_native.implementation.name == "IBIS" and as_native._level == "lazy":
+             assert isinstance(as_native, LazyFrame)  # help mypy

-         from pointblank.validate import get_row_count
+             ibis_native = as_native.to_native()

-         row_count = get_row_count(data=self.data)
-         column_count = len(self.data.columns)
-
-         profile.update(
-             {
-                 "tbl_type": self.tbl_type,
-                 "dimensions": {"rows": row_count, "columns": column_count},
-                 "columns": [],
-             }
-         )
-
-         # Determine which DataFrame library is available
-         df_lib = _select_df_lib(preference="polars")
-         df_lib_str = str(df_lib)
-
-         if "polars" in df_lib_str:
-             df_lib_use = "polars"
-         else:
-             df_lib_use = "pandas"
-
-         column_dtypes = list(self.data.schema().items())
-
-         for idx, column in enumerate(self.data.columns):
-             dtype_str = str(column_dtypes[idx][1])
-
-             col_data = self.data[column]
-             col_data_no_null = self.data.drop_null().head(5)[column]
-
-             #
-             # Collection of sample data
-             #
-             if "date" in dtype_str.lower() or "timestamp" in dtype_str.lower():
-                 if df_lib_use == "polars":
-                     import polars as pl
-
-                     sample_data = col_data_no_null.to_polars().cast(pl.String).to_list()
-                 else:
-                     sample_data = col_data_no_null.to_pandas().astype(str).to_list()
+             valid_conversion_methods = ("to_pyarrow", "to_pandas", "to_polars")
+             for conv_method in valid_conversion_methods:
+                 try:
+                     valid_native = getattr(ibis_native, conv_method)()
+                 except (NotImplementedError, ImportError, ModuleNotFoundError):
+                     continue
+                 break
              else:
-                 if df_lib_use == "polars":
-                     sample_data = col_data_no_null.to_polars().to_list()
-                 else:
-                     sample_data = col_data_no_null.to_pandas().to_list()
-
-             n_missing_vals = int(_to_df_lib(col_data.isnull().sum(), df_lib=df_lib_use))
-             n_unique_vals = int(_to_df_lib(col_data.nunique(), df_lib=df_lib_use))
-
-             # If there are missing values, subtract 1 from the number of unique values
-             # to account for the missing value which shouldn't be included in the count
-             if (n_missing_vals > 0) and (n_unique_vals > 0):
-                 n_unique_vals = n_unique_vals - 1
-
-             f_missing_vals = _round_to_sig_figs(n_missing_vals / row_count, 3)
-             f_unique_vals = _round_to_sig_figs(n_unique_vals / row_count, 3)
-
-             col_profile = {
-                 "column_name": column,
-                 "column_type": dtype_str,
-                 "column_number": idx + 1,
-                 "n_missing_values": n_missing_vals,
-                 "f_missing_values": f_missing_vals,
-                 "n_unique_values": n_unique_vals,
-                 "f_unique_values": f_unique_vals,
-             }
-
-             #
-             # Numerical columns
-             #
-             if "int" in dtype_str.lower() or "float" in dtype_str.lower():
-                 n_negative_vals = int(
-                     _to_df_lib(col_data.between(-1e26, -1e-26).sum(), df_lib=df_lib_use)
+                 msg = (
+                     "To use `ibis` as input, you must have one of arrow, pandas, polars or numpy "
+                     "available in the process. Until `ibis` is fully supported by Narwhals, this is "
+                     "necessary. Additionally, the data must be collected in order to calculate some "
+                     "structural statistics, which may be performance detrimental."
                  )
-                 f_negative_vals = _round_to_sig_figs(n_negative_vals / row_count, 3)
-
-                 n_zero_vals = int(_to_df_lib(col_data.between(0, 0).sum(), df_lib=df_lib_use))
-                 f_zero_vals = _round_to_sig_figs(n_zero_vals / row_count, 3)
-
-                 n_positive_vals = row_count - n_missing_vals - n_negative_vals - n_zero_vals
-                 f_positive_vals = _round_to_sig_figs(n_positive_vals / row_count, 3)
-
-                 col_profile_additional = {
-                     "n_negative_values": n_negative_vals,
-                     "f_negative_values": f_negative_vals,
-                     "n_zero_values": n_zero_vals,
-                     "f_zero_values": f_zero_vals,
-                     "n_positive_values": n_positive_vals,
-                     "f_positive_values": f_positive_vals,
-                     "sample_data": sample_data,
-                 }
-                 col_profile.update(col_profile_additional)
-
-                 col_profile_stats = {
-                     "statistics": {
-                         "numerical": {
-                             "descriptive": {
-                                 "mean": round(_to_df_lib(col_data.mean(), df_lib=df_lib_use), 2),
-                                 "std_dev": round(_to_df_lib(col_data.std(), df_lib=df_lib_use), 4),
-                             },
-                             "quantiles": {
-                                 "min": _to_df_lib(col_data.min(), df_lib=df_lib_use),
-                                 "p05": round(
-                                     _to_df_lib(col_data.approx_quantile(0.05), df_lib=df_lib_use),
-                                     2,
-                                 ),
-                                 "q_1": round(
-                                     _to_df_lib(col_data.approx_quantile(0.25), df_lib=df_lib_use),
-                                     2,
-                                 ),
-                                 "med": _to_df_lib(col_data.median(), df_lib=df_lib_use),
-                                 "q_3": round(
-                                     _to_df_lib(col_data.approx_quantile(0.75), df_lib=df_lib_use),
-                                     2,
-                                 ),
-                                 "p95": round(
-                                     _to_df_lib(col_data.approx_quantile(0.95), df_lib=df_lib_use),
-                                     2,
-                                 ),
-                                 "max": _to_df_lib(col_data.max(), df_lib=df_lib_use),
-                                 "iqr": round(
-                                     _to_df_lib(col_data.quantile(0.75), df_lib=df_lib_use)
-                                     - _to_df_lib(col_data.quantile(0.25), df_lib=df_lib_use),
-                                     2,
-                                 ),
-                             },
-                         }
-                     }
-                 }
-                 col_profile.update(col_profile_stats)
-
-             #
-             # String columns
-             #
-             elif "string" in dtype_str.lower() or "char" in dtype_str.lower():
-                 col_profile_additional = {
-                     "sample_data": sample_data,
-                 }
-                 col_profile.update(col_profile_additional)
-
-                 # Transform `col_data` to a column of string lengths
-                 col_str_len_data = col_data.length()
-
-                 col_profile_stats = {
-                     "statistics": {
-                         "string_lengths": {
-                             "descriptive": {
-                                 "mean": round(
-                                     float(_to_df_lib(col_str_len_data.mean(), df_lib=df_lib_use)), 2
-                                 ),
-                                 "std_dev": round(
-                                     float(_to_df_lib(col_str_len_data.std(), df_lib=df_lib_use)), 4
-                                 ),
-                             },
-                             "quantiles": {
-                                 "min": int(_to_df_lib(col_str_len_data.min(), df_lib=df_lib_use)),
-                                 "p05": int(
-                                     _to_df_lib(
-                                         col_str_len_data.approx_quantile(0.05),
-                                         df_lib=df_lib_use,
-                                     )
-                                 ),
-                                 "q_1": int(
-                                     _to_df_lib(
-                                         col_str_len_data.approx_quantile(0.25),
-                                         df_lib=df_lib_use,
-                                     )
-                                 ),
-                                 "med": int(
-                                     _to_df_lib(col_str_len_data.median(), df_lib=df_lib_use)
-                                 ),
-                                 "q_3": int(
-                                     _to_df_lib(
-                                         col_str_len_data.approx_quantile(0.75),
-                                         df_lib=df_lib_use,
-                                     )
-                                 ),
-                                 "p95": int(
-                                     _to_df_lib(
-                                         col_str_len_data.approx_quantile(0.95),
-                                         df_lib=df_lib_use,
-                                     )
-                                 ),
-                                 "max": int(_to_df_lib(col_str_len_data.max(), df_lib=df_lib_use)),
-                                 "iqr": int(
-                                     _to_df_lib(
-                                         col_str_len_data.approx_quantile(0.75),
-                                         df_lib=df_lib_use,
-                                     )
-                                 )
-                                 - int(
-                                     _to_df_lib(
-                                         col_str_len_data.approx_quantile(0.25),
-                                         df_lib=df_lib_use,
-                                     )
-                                 ),
-                             },
-                         }
-                     }
-                 }
-                 col_profile.update(col_profile_stats)
-
-             #
-             # Date and datetime columns
-             #
-             elif "date" in dtype_str.lower() or "timestamp" in dtype_str.lower():
-                 col_profile_additional = {
-                     "sample_data": sample_data,
-                 }
-                 col_profile.update(col_profile_additional)
-
-                 min_date = _to_df_lib(col_data.min(), df_lib=df_lib_use)
-                 max_date = _to_df_lib(col_data.max(), df_lib=df_lib_use)
-
-                 col_profile_stats = {
-                     "statistics": {
-                         "datetime": {
-                             "min": str(min_date),
-                             "max": str(max_date),
-                         }
-                     }
-                 }
-                 col_profile.update(col_profile_stats)
-
-             #
-             # Boolean columns
-             #
-             elif "bool" in dtype_str.lower():
-                 col_profile_additional = {
-                     "sample_data": sample_data,
-                 }
-                 col_profile.update(col_profile_additional)
-
-                 n_true_values = _to_df_lib(col_data.cast(int).sum(), df_lib=df_lib)
-                 f_true_values = _round_to_sig_figs(n_true_values / row_count, 3)
-
-                 n_false_values = row_count - n_missing_vals - n_true_values
-                 f_false_values = _round_to_sig_figs(n_false_values / row_count, 3)
-
-                 col_profile_stats = {
-                     "statistics": {
-                         "boolean": {
-                             "n_true_values": n_true_values,
-                             "f_true_values": f_true_values,
-                             "n_false_values": n_false_values,
-                             "f_false_values": f_false_values,
-                         }
-                     }
-                 }
-                 col_profile.update(col_profile_stats)
-
-             profile["columns"].append(col_profile)
-
-         return profile
-
-     def get_tabular_report(self) -> GT:
-         column_data = self.profile["columns"]
-
-         tbl_name = self.tbl_name
+                 raise ImportError(msg)
+             as_native = nw.from_native(valid_native)

-         stats_list = []
-         datetime_row_list = []
+         self.nw_data: Frame = nw.from_native(as_native)

-         n_rows = self.profile["dimensions"]["rows"]
-         n_columns = self.profile["dimensions"]["columns"]
+         self.tbl_name: str | None = tbl_name
+         self.profile: _DataProfile = self._generate_profile_df()

-         # Iterate over each column's data and obtain a dictionary of statistics for each column
-         for idx, col in enumerate(column_data):
-             if "statistics" in col and "numerical" in col["statistics"]:
-                 col_dict = _process_numerical_column_data(col)
-             elif "statistics" in col and "string_lengths" in col["statistics"]:
-                 col_dict = _process_string_column_data(col)
-             elif "statistics" in col and "datetime" in col["statistics"]:
-                 col_dict = _process_datetime_column_data(col)
-                 datetime_row_list.append(idx)
-             elif "statistics" in col and "boolean" in col["statistics"]:
-                 col_dict = _process_boolean_column_data(col)
-             else:
-                 col_dict = _process_other_column_data(col)
+     def _generate_profile_df(self) -> _DataProfile:
+         columns: list[str] = self.nw_data.columns

-             stats_list.append(col_dict)
+         profile = _DataProfile(
+             table_name=self.tbl_name,
+             columns=columns,
+             implementation=self.nw_data.implementation,
+         )
+         schema: Mapping[str, Any] = self.nw_data.schema
+         for column in columns:
+             col_data: DataFrame = self.nw_data.select(column)
+
+             ## Handle dtyping:
+             native_dtype = schema[column]
+             if _TypeMap.is_illegal(native_dtype):
+                 continue
+             try:
+                 prof: type[ColumnProfile] = _TypeMap.fetch_profile(native_dtype)
+             except NotImplementedError:
+                 continue
+
+             col_profile = ColumnProfile(colname=column, coltype=native_dtype)
+
+             ## Collect Sample Data:
+             ## This is the most consistent way (i think) to get the samples out of the data.
+             ## We can avoid writing our own logic to determine operations and rely on narwhals.
+             raw_vals: list[Any] = (
+                 _as_physical(col_data.drop_nulls().head(5)).to_dict()[column].to_list()
+             )
+             col_profile.sample_data = [str(x) for x in raw_vals]

-         # Determine which DataFrame library is available and construct the DataFrame
-         # based on the available library
-         df_lib = _select_df_lib(preference="polars")
-         df_lib_str = str(df_lib)
+             col_profile.calc_stats(col_data)

-         if "polars" in df_lib_str:
-             import polars as pl
+             sub_profile: ColumnProfile = col_profile.spawn_profile(prof)
+             sub_profile.calc_stats(col_data)

-             stats_df = pl.DataFrame(stats_list)
-         else:
-             import pandas as pd
+             profile.column_profiles.append(sub_profile)

-             stats_df = pd.DataFrame(stats_list)
+         profile.set_row_count(self.nw_data)

-         stats_df = pl.DataFrame(stats_list)
+         return profile

-         stat_columns = [
-             "missing_vals",
-             "unique_vals",
-             "mean",
-             "std_dev",
-             "min",
-             "p05",
-             "q_1",
-             "med",
-             "q_3",
-             "p95",
-             "max",
-             "iqr",
-         ]
+     @property
+     def summary_data(self) -> IntoFrameT:
+         return self.profile.as_dataframe(strict=False).to_native()

+     def get_tabular_report(self, *, show_sample_data: bool = False) -> GT:
          # Create the label, table type, and thresholds HTML fragments
          table_type_html = _create_table_type_html(
-             tbl_type=self.tbl_type, tbl_name=tbl_name, font_size="10px"
+             tbl_type=str(self.profile.implementation), tbl_name=self.tbl_name, font_size="10px"
          )

-         tbl_dims_html = _create_table_dims_html(columns=n_columns, rows=n_rows, font_size="10px")
+         tbl_dims_html = _create_table_dims_html(
+             columns=len(self.profile.columns), rows=self.profile.row_count, font_size="10px"
+         )

          # Compose the subtitle HTML fragment
          combined_title = (
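The new `__init__` above relies on Python's for/else to try the ibis conversion methods in order (`to_pyarrow`, then `to_pandas`, then `to_polars`): `break` on the first success skips the `else` clause, and the `else` clause raises only when every converter failed. A standalone sketch of that control flow (illustrative helper name, not part of the package):

def first_available_conversion(ibis_native):
    # Mirrors the fallback loop in the new DataScan.__init__ (sketch only).
    for conv_method in ("to_pyarrow", "to_pandas", "to_polars"):
        try:
            valid_native = getattr(ibis_native, conv_method)()
        except (NotImplementedError, ImportError, ModuleNotFoundError):
            continue  # this backend is unavailable; try the next one
        break  # success: leave the loop, skipping the for-loop's else clause
    else:
        # Reached only when no break occurred, i.e. every converter failed.
        raise ImportError("no in-memory backend (pyarrow/pandas/polars) available")
    return valid_native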
@@ -685,113 +219,272 @@ class DataScan:

          # TODO: Ensure width is 905px in total

-         gt_tbl = (
-             GT(stats_df, id="col_summary")
-             .tab_header(title=html(combined_title))
-             .cols_align(align="right", columns=stat_columns)
-             .opt_table_font(font=google_font("IBM Plex Sans"))
-             .opt_align_table_header(align="left")
-             .tab_style(
-                 style=style.text(font=google_font("IBM Plex Mono")),
-                 locations=loc.body(),
+         data: DataFrame = self.profile.as_dataframe(strict=False)
+
+         ## Remove all null columns:
+         all_null: list[str] = []
+         for stat_name in data.iter_columns():
+             col_len = len(stat_name.drop_nulls())
+             if col_len == 0:
+                 all_null.append(stat_name.name)
+         data = data.drop(all_null)
+
+         if not show_sample_data:
+             data = data.drop("sample_data")
+
+         # find what stat cols were used in the analysis
+         non_stat_cols = ("icon", "colname")  # TODO: need a better place for this
+         present_stat_cols: set[str] = set(data.columns) - set(non_stat_cols)
+         present_stat_cols.remove("coltype")
+         with contextlib.suppress(KeyError):
+             present_stat_cols.remove("freqs")  # TODO: currently used for html but no displayed?
+
+         ## Assemble the target order and find what columns need borders.
+         ## Borders should be placed to divide the stat "groups" and create a
+         ## generally more aesthetically pleasing experience.
+         target_order: list[str] = list(non_stat_cols)
+         right_border_cols: list[str] = [non_stat_cols[-1]]
+
+         last_group: StatGroup = COLUMN_ORDER_REGISTRY[0].group
+         for col in COLUMN_ORDER_REGISTRY:
+             if col.name in present_stat_cols:
+                 cur_group: StatGroup = col.group
+                 target_order.append(col.name)
+
+                 start_new_group: bool = last_group != cur_group
+                 if start_new_group:
+                     last_group = cur_group
+                     last_col_added = target_order[-2]  # -2 since we don't include the current
+                     right_border_cols.append(last_col_added)
+
+         right_border_cols.append(target_order[-1])  # add border to last stat col
+
+         label_map: dict[str, Any] = self._build_label_map(target_order)
+
+         ## Final Formatting:
+         formatted_data = data.with_columns(
+             colname=nw.concat_str(
+                 nw.lit(
+                     "<div style='font-size: 13px; white-space: nowrap; text-overflow: ellipsis; overflow: hidden;'>"
+                 ),
+                 nw.col("colname"),
+                 nw.lit("</div><div style='font-size: 11px; color: gray;'>"),
+                 nw.col("coltype"),
+                 nw.lit("</div>"),
+             ),
+             __frac_n_unique=nw.col("n_unique") / nw.lit(self.profile.row_count),
+             __frac_n_missing=nw.col("n_missing") / nw.lit(self.profile.row_count),
+         )
+
+         ## Pull out type indicies:
+         # TODO: The stat types should get an enum? or something?
+         # TODO: This all assumes the dates are separated by dashes, is that even true?
+         # TODO: This all assumes date_stats are strings already, not ints or anything else.
+         any_dates: bool = formatted_data.select(
+             __tmp_idx=nw.col("coltype").str.contains("Date", literal=True)
+         )["__tmp_idx"].any()
+         if any_dates:
+             date_stats = [c for c in present_stat_cols if c in ("min", "max")]
+
+             formatted_data = formatted_data.with_columns(
+                 nw.when(nw.col("coltype").str.contains(r"\bDate\b", literal=False))
+                 .then(nw.col(c).cast(nw.String).str.replace_all("-", "<br>"))
+                 .otherwise(nw.col(c).cast(nw.String))
+                 for c in date_stats
              )
-             .tab_style(
-                 style=style.text(size="10px"),
-                 locations=loc.body(columns=stat_columns),
+
+         any_datetimes: bool = formatted_data.select(
+             __tmp_idx=nw.col("coltype").str.contains("Datetime", literal=True)
+         )["__tmp_idx"].any()
+         if any_datetimes:
+             datetime_idx = [c for c in present_stat_cols if c in ("min", "max")]
+             formatted_data = formatted_data.with_columns(
+                 nw.when(nw.col("coltype").str.contains(r"\bDatetime\b", literal=False))
+                 .then(nw.col(c).cast(nw.String).str.replace_all("-", "<br>"))
+                 .otherwise(nw.col(c).cast(nw.String))
+                 for c in datetime_idx
              )
-             .tab_style(
-                 style=style.text(size="14px"),
-                 locations=loc.body(columns="column_number"),
+
+         # format fractions:
+         # this is an anti-pattern but there's no serious alternative
+         for _fmt_col in ("__frac_n_unique", "__frac_n_missing"):
+             _formatted: list[str | None] = _fmt_frac(formatted_data[_fmt_col])
+             formatted: nw.Series = nw.new_series(
+                 _fmt_col, values=_formatted, backend=self.profile.implementation
             )
-             .tab_style(
-                 style=style.text(size="12px"),
-                 locations=loc.body(columns="column_name"),
+             formatted_data = formatted_data.drop(_fmt_col)
+             formatted_data = formatted_data.with_columns(formatted.alias(_fmt_col))
+
+         formatted_data = (
+             # TODO: This is a temporary solution?
+             # Format the unique and missing pct strings
+             formatted_data.with_columns(
+                 n_unique=nw.concat_str(
+                     nw.col("n_unique"),
+                     nw.lit("<br>"),
+                     nw.col("__frac_n_unique"),
+                 ),
+                 n_missing=nw.concat_str(
+                     nw.col("n_missing"),
+                     nw.lit("<br>"),
+                     nw.col("__frac_n_missing"),
+                 ),
             )
-             .tab_style(
-                 style=style.css("white-space: pre; overflow-x: visible;"),
-                 locations=loc.body(columns="min"),
+             # TODO: Should be able to use selectors for this
+             .drop("__frac_n_unique", "__frac_n_missing", "coltype")
+         )
+
+         if "freqs" in formatted_data.columns:  # TODO: don't love this arbitrary check
+             # Extract HTML freqs:
+             try:
+                 formatted_data = formatted_data.with_columns(
+                     __freq_true=nw.col("freqs").struct.field("True"),
+                     __freq_false=nw.col("freqs").struct.field("False"),
+                 )
+             except Exception:  # TODO: should be narrowed if possible
+                 # if no struct implimentation exists, it must be done manually
+                 freq_ser: nw.Series = formatted_data["freqs"]
+                 trues: list[int | None] = []
+                 falses: list[int | None] = []
+                 for freq in freq_ser:
+                     try:
+                         trues.append(freq["True"])
+                         falses.append(freq["False"])
+                     except (KeyError, TypeError):
+                         trues.append(None)
+                         falses.append(None)
+                 true_ser: nw.Series = nw.new_series(
+                     name="__freq_true", values=trues, backend=self.profile.implementation
+                 )
+                 false_ser: nw.Series = nw.new_series(
+                     name="__freq_false", values=falses, backend=self.profile.implementation
+                 )
+                 formatted_data = formatted_data.with_columns(
+                     __freq_true=true_ser, __freq_false=false_ser
+                 )
+
+             ## format pct true values
+             formatted_data = formatted_data.with_columns(
+                 # for bools, UQs are represented as percentages
+                 __pct_true=nw.col("__freq_true") / self.profile.row_count,
+                 __pct_false=nw.col("__freq_false") / self.profile.row_count,
             )
-             .tab_style(
-                 style=style.borders(sides="left", color="#D3D3D3", style="solid"),
-                 locations=loc.body(columns=["missing_vals", "mean", "min", "iqr"]),
+             for _fmt_col in ("__pct_true", "__pct_false"):
+                 _formatted: list[str | None] = _fmt_frac(formatted_data[_fmt_col])
+                 formatted = nw.new_series(
+                     name=_fmt_col, values=_formatted, backend=self.profile.implementation
+                 )
+                 formatted_data = formatted_data.drop(_fmt_col)
+                 formatted_data = formatted_data.with_columns(formatted.alias(_fmt_col))
+
+             formatted_data = (
+                 formatted_data.with_columns(
+                     __bool_unique_html=nw.concat_str(
+                         nw.lit("<span style='font-weight: bold;'>T</span>"),
+                         nw.col("__pct_true"),
+                         nw.lit("<br><span style='font-weight: bold;'>F</span>"),
+                         nw.col("__pct_false"),
+                     ),
+                 )
+                 .with_columns(
+                     n_unique=nw.when(~nw.col("__bool_unique_html").is_null())
+                     .then(nw.col("__bool_unique_html"))
+                     .otherwise(nw.col("n_unique"))
+                 )
+                 .drop(
+                     "__freq_true",
+                     "__freq_false",
+                     "__bool_unique_html",
+                     "freqs",
+                     "__pct_true",
+                     "__pct_false",
+                 )
             )
+
+         ## Determine Value Formatting Selectors:
+         fmt_int: list[str] = formatted_data.select(nw.selectors.by_dtype(nw.dtypes.Int64)).columns
+         fmt_float: list[str] = formatted_data.select(
+             nw.selectors.by_dtype(nw.dtypes.Float64)
+         ).columns
+
+         ## GT Table:
+         gt_tbl = (
+             GT(formatted_data.to_native())
+             .tab_header(title=html(combined_title))
+             .tab_source_note(source_note="String columns statistics regard the string's length.")
+             .cols_align(align="right", columns=list(present_stat_cols))
+             .opt_table_font(font=google_font("IBM Plex Sans"))
+             .opt_align_table_header(align="left")
+             .tab_style(style=style.text(font=google_font("IBM Plex Mono")), locations=loc.body())
+             .cols_move_to_start(target_order)
+             ## Labeling
+             .cols_label(label_map)
+             .cols_label(icon="", colname="Column")
+             .cols_align("center", columns=list(present_stat_cols))
              .tab_style(
-                 style=style.borders(sides="left", color="#E5E5E5", style="dashed"),
-                 locations=loc.body(columns=["std_dev", "p05", "q_1", "med", "q_3", "p95", "max"]),
+                 style=style.text(align="right"), locations=loc.body(columns=list(present_stat_cols))
             )
-             .tab_style(
-                 style=style.borders(sides="left", style="none"),
-                 locations=loc.body(
-                     columns=["p05", "q_1", "med", "q_3", "p95", "max"],
-                     rows=datetime_row_list,
-                 ),
+             ## Value Formatting
+             .fmt_integer(columns=fmt_int)
+             .fmt_number(
+                 columns=fmt_float,
+                 decimals=2,
+                 drop_trailing_dec_mark=True,
+                 drop_trailing_zeros=True,
             )
+             ## Borders
              .tab_style(
-                 style=style.fill(color="#FCFCFC"),
-                 locations=loc.body(columns=["missing_vals", "unique_vals", "iqr"]),
+                 style=style.borders(sides="right", color="#D3D3D3", style="solid"),
+                 locations=loc.body(columns=right_border_cols),
             )
              .tab_style(
-                 style=style.text(align="center"), locations=loc.column_labels(columns=stat_columns)
+                 style=style.borders(sides="left", color="#E5E5E5", style="dashed"),
+                 locations=loc.body(columns=list(present_stat_cols)),
             )
-             .cols_label(
-                 column_number="",
-                 icon="",
-                 column_name="Column",
-                 missing_vals="NA",
-                 unique_vals="UQ",
-                 mean="Mean",
-                 std_dev="SD",
-                 min="Min",
-                 p05=html(
-                     'P<span style="font-size: 0.75em; vertical-align: sub; position: relative; line-height: 0.5em;">5</span>'
-                 ),
-                 q_1=html(
-                     'Q<span style="font-size: 0.75em; vertical-align: sub; position: relative; line-height: 0.5em;">1</span>'
-                 ),
-                 med="Med",
-                 q_3=html(
-                     'Q<span style="font-size: 0.75em; vertical-align: sub; position: relative; line-height: 0.5em;">3</span>'
-                 ),
-                 p95=html(
-                     'P<span style="font-size: 0.75em; vertical-align: sub; position: relative; line-height: 0.5em;">95</span>'
-                 ),
-                 max="Max",
-                 iqr="IQR",
+             ## Formatting
+             .tab_style(
+                 style=style.text(size="10px"),
+                 locations=loc.body(columns=list(present_stat_cols)),
             )
+             .tab_style(style=style.text(size="12px"), locations=loc.body(columns="colname"))
              .cols_width(
-                 column_number="40px",
-                 icon="35px",
-                 column_name="200px",
-                 missing_vals="50px",
-                 unique_vals="50px",
-                 mean="50px",
-                 std_dev="50px",
-                 min="50px",
-                 p05="50px",
-                 q_1="50px",
-                 med="50px",
-                 q_3="50px",
-                 p95="50px",
-                 max="50px",
-                 iqr="50px",  # 875 px total
+                 icon="35px", colname="200px", **{stat_col: "60px" for stat_col in present_stat_cols}
             )
         )

+         if "PYARROW" != formatted_data.implementation.name:
+             # TODO: this is more proactive than it should be
+             gt_tbl = gt_tbl.sub_missing(missing_text="-")
+             # https://github.com/posit-dev/great-tables/issues/667
+
          # If the version of `great_tables` is `>=0.17.0` then disable Quarto table processing
          if version("great_tables") >= "0.17.0":
              gt_tbl = gt_tbl.tab_options(quarto_disable_processing=True)

          return gt_tbl

-     def to_dict(self) -> dict:
-         return self.profile
+     @staticmethod
+     def _build_label_map(cols: Sequence[str]) -> dict[str, Any]:
+         label_map: dict[str, Any] = {}
+         for target_col in cols:
+             try:
+                 matching_stat = next(
+                     stat for stat in COLUMN_ORDER_REGISTRY if target_col == stat.name
+                 )
+             except StopIteration:
+                 continue
+             label_map[target_col] = matching_stat.label
+         return label_map

      def to_json(self) -> str:
-         return json.dumps(self.profile, indent=4)
+         prof_dict = self.profile.as_dataframe(strict=False).to_dict(as_series=False)
+
+         return json.dumps(prof_dict, indent=4, default=str)

      def save_to_json(self, output_file: str):
+         json_string: str = self.to_json()
          with open(output_file, "w") as f:
-             json.dump(self.profile, f, indent=4)
+             json.dump(json_string, f, indent=4)


  def col_summary_tbl(data: FrameT | Any, tbl_name: str | None = None) -> GT:
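In the hunk above, `to_json` now serializes a plain dict of the profile frame and leans on `json.dumps(..., default=str)`: the `default` hook is called for any value the encoder cannot handle natively (dates, dtype objects, and similar) and falls back to its str() form. A small self-contained illustration of that behavior:

import datetime as dt
import json

# Without default=str this would raise TypeError on the date value; with it,
# the date serializes as "2020-01-01".
payload = {"colname": "x", "min": dt.date(2020, 1, 1), "mean": 2.5}
print(json.dumps(payload, indent=4, default=str))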
@@ -875,337 +568,3 @@ def col_summary_tbl(data: FrameT | Any, tbl_name: str | None = None) -> GT:

      scanner = DataScan(data=data, tbl_name=tbl_name)
      return scanner.get_tabular_report()
-
-
- def _to_df_lib(expr: any, df_lib: str) -> any:
-     if df_lib == "polars":
-         return expr.to_polars()
-     else:
-         return expr.to_pandas()
-
-
- def _round_to_sig_figs(value: float, sig_figs: int) -> float:
-     if value == 0:
-         return 0
-     return round(value, sig_figs - int(floor(log10(abs(value)))) - 1)
-
-
- def _compact_integer_fmt(value: float | int) -> str:
-     if value == 0:
-         formatted = "0"
-     elif abs(value) >= 1 and abs(value) < 10_000:
-         formatted = fmt_integer(value, use_seps=False)[0]
-     else:
-         formatted = fmt_scientific(value, decimals=1, exp_style="E1")[0]
-
-     return formatted
-
-
- def _compact_decimal_fmt(value: float | int) -> str:
-     if value == 0:
-         formatted = "0.00"
-     elif abs(value) < 1 and abs(value) >= 0.01:
-         formatted = fmt_number(value, decimals=2)[0]
-     elif abs(value) < 0.01:
-         formatted = fmt_scientific(value, decimals=1, exp_style="E1")[0]
-     elif abs(value) >= 1 and abs(value) < 10:
-         formatted = fmt_number(value, decimals=2, use_seps=False)[0]
-     elif abs(value) >= 10 and abs(value) < 1000:
-         formatted = fmt_number(value, n_sigfig=3)[0]
-     elif abs(value) >= 1000 and abs(value) < 10_000:
-         formatted = fmt_number(value, n_sigfig=4, use_seps=False)[0]
-     else:
-         formatted = fmt_scientific(value, decimals=1, exp_style="E1")[0]
-
-     return formatted
-
-
- def _compact_0_1_fmt(value: float | int) -> str:
-     if value == 0:
-         formatted = " 0.00"
-     elif value == 1:
-         formatted = " 1.00"
-     elif abs(value) < 0.01:
-         formatted = "<0.01"
-     elif abs(value) > 0.99 and abs(value) < 1.0:
-         formatted = ">0.99"
-     elif abs(value) <= 0.99 and abs(value) >= 0.01:
-         formatted = " " + fmt_number(value, decimals=2)[0]
-     else:
-         formatted = fmt_number(value, n_sigfig=3)[0]
-     return formatted
-
-
- def _process_numerical_column_data(column_data: dict) -> dict:
-     column_number = column_data["column_number"]
-     column_name = column_data["column_name"]
-     column_type = column_data["column_type"]
-
-     column_name_and_type = (
-         f"<div style='font-size: 13px; white-space: nowrap; text-overflow: ellipsis; overflow: hidden;'>{column_name}</div>"
-         f"<div style='font-size: 11px; color: gray;'>{column_type}</div>"
-     )
-
-     # Get the Missing and Unique value counts and fractions
-     missing_vals = column_data["n_missing_values"]
-     unique_vals = column_data["n_unique_values"]
-     missing_vals_frac = _compact_0_1_fmt(column_data["f_missing_values"])
-     unique_vals_frac = _compact_0_1_fmt(column_data["f_unique_values"])
-
-     missing_vals_str = f"{missing_vals}<br>{missing_vals_frac}"
-     unique_vals_str = f"{unique_vals}<br>{unique_vals_frac}"
-
-     # Get the descriptive and quantile statistics
-     descriptive_stats = column_data["statistics"]["numerical"]["descriptive"]
-     quantile_stats = column_data["statistics"]["numerical"]["quantiles"]
-
-     # Get all values from the descriptive and quantile stats into a single list
-     quantile_stats_vals = [v[1] for v in quantile_stats.items()]
-
-     # Determine if the quantile stats are all integerlike
-     integerlike = []
-
-     # Determine if the quantile stats are integerlike
-     for val in quantile_stats_vals:
-         # Check if a quantile value is a number and then if it is intergerlike
-         if not isinstance(val, (int, float)):
-             continue  # pragma: no cover
-         else:
-             integerlike.append(val % 1 == 0)
-     quantile_vals_integerlike = all(integerlike)
-
-     # Determine the formatter to use for the quantile values
-     if quantile_vals_integerlike:
-         q_formatter = _compact_integer_fmt
-     else:
-         q_formatter = _compact_decimal_fmt
-
-     # Format the descriptive statistics (mean and standard deviation)
-     for key, value in descriptive_stats.items():
-         descriptive_stats[key] = _compact_decimal_fmt(value=value)
-
-     # Format the quantile statistics
-     for key, value in quantile_stats.items():
-         quantile_stats[key] = q_formatter(value=value)
-
-     # Create a single dictionary with the statistics for the column
-     stats_dict = {
-         "column_number": column_number,
-         "icon": SVG_ICONS_FOR_DATA_TYPES["numeric"],
-         "column_name": column_name_and_type,
-         "missing_vals": missing_vals_str,
-         "unique_vals": unique_vals_str,
-         **descriptive_stats,
-         **quantile_stats,
-     }
-
-     return stats_dict
-
-
- def _process_string_column_data(column_data: dict) -> dict:
-     column_number = column_data["column_number"]
-     column_name = column_data["column_name"]
-     column_type = column_data["column_type"]
-
-     column_name_and_type = (
-         f"<div style='font-size: 13px; white-space: nowrap; text-overflow: ellipsis; overflow: hidden;'>{column_name}</div>"
-         f"<div style='font-size: 11px; color: gray;'>{column_type}</div>"
-     )
-
-     # Get the Missing and Unique value counts and fractions
-     missing_vals = column_data["n_missing_values"]
-     unique_vals = column_data["n_unique_values"]
-     missing_vals_frac = _compact_0_1_fmt(column_data["f_missing_values"])
-     unique_vals_frac = _compact_0_1_fmt(column_data["f_unique_values"])
-
-     missing_vals_str = f"{missing_vals}<br>{missing_vals_frac}"
-     unique_vals_str = f"{unique_vals}<br>{unique_vals_frac}"
-
-     # Get the descriptive and quantile statistics
-     descriptive_stats = column_data["statistics"]["string_lengths"]["descriptive"]
-     quantile_stats = column_data["statistics"]["string_lengths"]["quantiles"]
-
-     # Format the descriptive statistics (mean and standard deviation)
-     for key, value in descriptive_stats.items():
-         formatted_val = _compact_decimal_fmt(value=value)
-         descriptive_stats[key] = (
-             f'<div><div>{formatted_val}</div><div style="float: left; position: absolute;">'
-             '<div title="string length measure" style="font-size: 7px; color: #999; '
-             'font-style: italic; cursor: help;">SL</div></div></div>'
-         )
-
-     # Format the quantile statistics
-     for key, value in quantile_stats.items():
-         formatted_val = _compact_integer_fmt(value=value)
-         quantile_stats[key] = (
-             f'<div><div>{formatted_val}</div><div style="float: left; position: absolute;">'
-             '<div title="string length measure" style="font-size: 7px; color: #999; '
-             'font-style: italic; cursor: help;">SL</div></div></div>'
-         )
-
-     # Create a single dictionary with the statistics for the column
-     stats_dict = {
-         "column_number": column_number,
-         "icon": SVG_ICONS_FOR_DATA_TYPES["string"],
-         "column_name": column_name_and_type,
-         "missing_vals": missing_vals_str,
-         "unique_vals": unique_vals_str,
-         **descriptive_stats,
-         "min": quantile_stats["min"],
-         "p05": "&mdash;",
-         "q_1": "&mdash;",
-         "med": quantile_stats["med"],
-         "q_3": "&mdash;",
-         "p95": "&mdash;",
-         "max": quantile_stats["max"],
-         "iqr": "&mdash;",
-     }
-
-     return stats_dict
-
-
- def _process_datetime_column_data(column_data: dict) -> dict:
-     column_number = column_data["column_number"]
-     column_name = column_data["column_name"]
-     column_type = column_data["column_type"]
-
-     long_column_type = len(column_type) > 22
-
-     if long_column_type:
-         column_type_style = "font-size: 7.5px; color: gray; margin-top: 3px; margin-bottom: 2px;"
-     else:
-         column_type_style = "font-size: 11px; color: gray;"
-
-     column_name_and_type = (
-         f"<div style='font-size: 13px; white-space: nowrap; text-overflow: ellipsis; overflow: hidden;'>{column_name}</div>"
-         f"<div style='{column_type_style}'>{column_type}</div>"
-     )
-
-     # Get the Missing and Unique value counts and fractions
-     missing_vals = column_data["n_missing_values"]
-     unique_vals = column_data["n_unique_values"]
-     missing_vals_frac = _compact_0_1_fmt(column_data["f_missing_values"])
-     unique_vals_frac = _compact_0_1_fmt(column_data["f_unique_values"])
-
-     missing_vals_str = f"{missing_vals}<br>{missing_vals_frac}"
-     unique_vals_str = f"{unique_vals}<br>{unique_vals_frac}"
-
-     # Get the min and max date
-     min_date = column_data["statistics"]["datetime"]["min"]
-     max_date = column_data["statistics"]["datetime"]["max"]
-
-     # Format the dates so that they don't break across lines
-     min_max_date_str = f"<span style='text-align: left; white-space: nowrap; overflow-x: visible;'>&nbsp;{min_date} &ndash; {max_date}</span>"
-
-     # Create a single dictionary with the statistics for the column
-     stats_dict = {
-         "column_number": column_number,
-         "icon": SVG_ICONS_FOR_DATA_TYPES["date"],
-         "column_name": column_name_and_type,
-         "missing_vals": missing_vals_str,
-         "unique_vals": unique_vals_str,
-         "mean": "&mdash;",
-         "std_dev": "&mdash;",
-         "min": min_max_date_str,
-         "p05": "",
-         "q_1": "",
-         "med": "",
-         "q_3": "",
-         "p95": "",
-         "max": "",
-         "iqr": "&mdash;",
-     }
-
-     return stats_dict
-
-
- def _process_boolean_column_data(column_data: dict) -> dict:
-     column_number = column_data["column_number"]
-     column_name = column_data["column_name"]
-     column_type = column_data["column_type"]
-
-     column_name_and_type = (
-         f"<div style='font-size: 13px; white-space: nowrap; text-overflow: ellipsis; overflow: hidden;'>{column_name}</div>"
-         f"<div style='font-size: 11px; color: gray;'>{column_type}</div>"
-     )
-
-     # Get the missing value count and fraction
-     missing_vals = column_data["n_missing_values"]
-     missing_vals_frac = _compact_0_1_fmt(column_data["f_missing_values"])
-     missing_vals_str = f"{missing_vals}<br>{missing_vals_frac}"
-
-     # Get the fractions of True and False values
-     f_true_values = column_data["statistics"]["boolean"]["f_true_values"]
-     f_false_values = column_data["statistics"]["boolean"]["f_false_values"]
-
-     true_vals_frac_fmt = _compact_0_1_fmt(f_true_values)
-     false_vals_frac_fmt = _compact_0_1_fmt(f_false_values)
-
-     # Create an HTML string that combines fractions for the True and False values; this will be
-     # used in the Unique Vals column of the report table
-     true_false_vals_str = (
-         f"<span style='font-weight: bold;'>T</span>{true_vals_frac_fmt}<br>"
-         f"<span style='font-weight: bold;'>F</span>{false_vals_frac_fmt}"
-     )
-
-     # Create a single dictionary with the statistics for the column
-     stats_dict = {
-         "column_number": column_number,
-         "icon": SVG_ICONS_FOR_DATA_TYPES["boolean"],
-         "column_name": column_name_and_type,
-         "missing_vals": missing_vals_str,
-         "unique_vals": true_false_vals_str,
-         "mean": "&mdash;",
-         "std_dev": "&mdash;",
-         "min": "&mdash;",
-         "p05": "&mdash;",
-         "q_1": "&mdash;",
-         "med": "&mdash;",
-         "q_3": "&mdash;",
-         "p95": "&mdash;",
-         "max": "&mdash;",
-         "iqr": "&mdash;",
-     }
-
-     return stats_dict
-
-
- def _process_other_column_data(column_data: dict) -> dict:
-     column_number = column_data["column_number"]
-     column_name = column_data["column_name"]
-     column_type = column_data["column_type"]
-
-     column_name_and_type = (
-         f"<div style='font-size: 13px; white-space: nowrap; text-overflow: ellipsis; overflow: hidden;'>{column_name}</div>"
-         f"<div style='font-size: 11px; color: gray;'>{column_type}</div>"
-     )
-
-     # Get the Missing and Unique value counts and fractions
-     missing_vals = column_data["n_missing_values"]
-     unique_vals = column_data["n_unique_values"]
-     missing_vals_frac = _compact_decimal_fmt(column_data["f_missing_values"])
-     unique_vals_frac = _compact_decimal_fmt(column_data["f_unique_values"])
-
-     missing_vals_str = f"{missing_vals}<br>{missing_vals_frac}"
-     unique_vals_str = f"{unique_vals}<br>{unique_vals_frac}"
-
-     # Create a single dictionary with the statistics for the column
-     stats_dict = {
-         "column_number": column_number,
-         "icon": SVG_ICONS_FOR_DATA_TYPES["object"],
-         "column_name": column_name_and_type,
-         "missing_vals": missing_vals_str,
-         "unique_vals": unique_vals_str,
-         "mean": "&mdash;",
-         "std_dev": "&mdash;",
-         "min": "&mdash;",
-         "p05": "&mdash;",
-         "q_1": "&mdash;",
-         "med": "&mdash;",
-         "q_3": "&mdash;",
-         "p95": "&mdash;",
-         "max": "&mdash;",
-         "iqr": "&mdash;",
-     }
-
-     return stats_dict
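For orientation, a hedged usage sketch of the 0.10.0 surface shown in this diff. Only names visible above (`DataScan`, `summary_data`, `get_tabular_report`, `to_json`, `col_summary_tbl`) are used; the example data is illustrative, and importing from `pointblank.datascan` assumes only the module path shown in this file's header:

import polars as pl

from pointblank.datascan import DataScan, col_summary_tbl

df = pl.DataFrame({"x": [1, 2, None], "y": ["a", "b", "b"]})

scan = DataScan(data=df, tbl_name="demo")
report = scan.get_tabular_report(show_sample_data=True)  # a great_tables GT
summary = scan.summary_data  # per-column stats as a native (here, polars) frame
json_str = scan.to_json()

# One-shot convenience wrapper over DataScan(...).get_tabular_report():
gt = col_summary_tbl(data=df, tbl_name="demo")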