pointblank 0.9.6__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +4 -0
- pointblank/_constants.py +4 -0
- pointblank/_datascan_utils.py +65 -0
- pointblank/_utils.py +126 -0
- pointblank/_utils_html.py +40 -0
- pointblank/assistant.py +1 -3
- pointblank/compare.py +27 -0
- pointblank/data/api-docs.txt +518 -125
- pointblank/datascan.py +318 -959
- pointblank/scan_profile.py +321 -0
- pointblank/scan_profile_stats.py +180 -0
- pointblank/schema.py +14 -3
- pointblank/validate.py +1425 -202
- {pointblank-0.9.6.dist-info → pointblank-0.10.0.dist-info}/METADATA +4 -3
- {pointblank-0.9.6.dist-info → pointblank-0.10.0.dist-info}/RECORD +18 -14
- {pointblank-0.9.6.dist-info → pointblank-0.10.0.dist-info}/WHEEL +1 -1
- {pointblank-0.9.6.dist-info → pointblank-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.9.6.dist-info → pointblank-0.10.0.dist-info}/top_level.txt +0 -0
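Before the per-file diff, a short orientation. The headline change in 0.10.0 is a rewrite of `pointblank/datascan.py`: `DataScan` drops its `@dataclass` scaffolding and the separate DataFrame/Ibis profiling paths in favor of a single Narwhals-backed pipeline, with the statistics logic moved into the new `pointblank/scan_profile.py` and `pointblank/scan_profile_stats.py` modules. A minimal, hedged sketch of the surface involved (every name used here appears in the diff below; the input frame and the top-level import path are assumptions for illustration):

```python
# Sketch of the 0.10.0 DataScan surface; the polars frame is illustrative only.
import polars as pl

from pointblank import DataScan, col_summary_tbl

df = pl.DataFrame({"x": [1, 2, None], "y": ["a", "b", "c"]})

scan = DataScan(data=df, tbl_name="demo")
report = scan.get_tabular_report(show_sample_data=True)  # keyword new in 0.10.0
print(scan.to_json())

# col_summary_tbl remains a thin wrapper: it builds a DataScan and returns
# its tabular report.
tbl = col_summary_tbl(data=df, tbl_name="demo")
```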
pointblank/datascan.py
CHANGED
```diff
@@ -1,24 +1,31 @@
 from __future__ import annotations
 
+import contextlib
 import json
-from dataclasses import dataclass, field
 from importlib.metadata import version
-from math import floor, log10
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import narwhals as nw
 from great_tables import GT, google_font, html, loc, style
-from great_tables.vals import fmt_integer, fmt_number, fmt_scientific
+from narwhals.dataframe import LazyFrame
 from narwhals.typing import FrameT
 
-from pointblank._constants import SVG_ICONS_FOR_DATA_TYPES
-from pointblank._utils import _get_tbl_type, _select_df_lib
-from pointblank._utils_html import _create_table_dims_html, _create_table_type_html
+from pointblank._utils_html import _create_table_dims_html, _create_table_type_html, _fmt_frac
+from pointblank.scan_profile import ColumnProfile, _as_physical, _DataProfile, _TypeMap
+from pointblank.scan_profile_stats import COLUMN_ORDER_REGISTRY
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping, Sequence
+
+    from narwhals.dataframe import DataFrame
+    from narwhals.typing import Frame, IntoFrameT
+
+    from pointblank.scan_profile_stats import StatGroup
+
 
 __all__ = ["DataScan", "col_summary_tbl"]
 
 
-@dataclass
 class DataScan:
     """
     Get a summary of a dataset.
```
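The import changes above set the tone for the rewrite: everything now routes through Narwhals, with typing-only imports fenced behind `TYPE_CHECKING`. For readers unfamiliar with Narwhals, here is a small standalone sketch of the wrap/unwrap cycle the new `__init__` relies on; `nw.from_native`, `.implementation`, and `.to_native()` are the calls used in the diff, and the polars frame is illustrative:

```python
import narwhals as nw
import polars as pl

native = pl.DataFrame({"a": [1, 2, 3]})

nw_frame = nw.from_native(native)    # wrap a native frame in the narwhals API
print(nw_frame.implementation.name)  # "POLARS", the backend tag DataScan branches on
print(type(nw_frame.to_native()))    # unwrap back to the original polars DataFrame
```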
```diff
@@ -113,565 +120,92 @@ class DataScan:
         A DataScan object.
     """
 
-    data: FrameT | Any
-    tbl_name: str | None = None
-
-    tbl_category: str = field(init=False)
-    tbl_type: str = field(init=False)
-    profile: dict = field(init=False)
-
-    def __post_init__(self):
-        # Determine if the data is a DataFrame that could be handled by Narwhals,
-        # or an Ibis Table
-        self.tbl_type = _get_tbl_type(data=self.data)
-        ibis_tbl = "ibis.expr.types.relations.Table" in str(type(self.data))
-        pl_pd_tbl = "polars" in self.tbl_type or "pandas" in self.tbl_type
-
-        # Set the table category based on the type of table (this will be used to determine
-        # how to handle the data)
-        if ibis_tbl:
-            self.tbl_category = "ibis"
-        else:
-            self.tbl_category = "dataframe"
-
-        # If the data is DataFrame, convert it to a Narwhals DataFrame
-        if pl_pd_tbl:
-            self.data_alt = nw.from_native(self.data)
-        else:
-            self.data_alt = None
-
-        # Generate the profile based on the `tbl_category` value
-        if self.tbl_category == "dataframe":
-            self.profile = self._generate_profile_df()
-
-        if self.tbl_category == "ibis":
-            self.profile = self._generate_profile_ibis()
-
-    def _generate_profile_df(self) -> dict:
-        profile = {}
-
-        if self.tbl_name:
-            profile["tbl_name"] = self.tbl_name
-
-        row_count = self.data_alt.shape[0]
-        column_count = self.data_alt.shape[1]
-
-        profile.update(
-            {
-                "tbl_type": self.tbl_type,
-                "dimensions": {"rows": row_count, "columns": column_count},
-                "columns": [],
-            }
-        )
-
-        for idx, column in enumerate(self.data_alt.columns):
-            col_data = self.data_alt[column]
-            native_dtype = str(self.data[column].dtype)
-
-            #
-            # Collection of sample data
-            #
-            if "date" in str(col_data.dtype).lower():
-                sample_data = col_data.drop_nulls().head(5).cast(nw.String).to_list()
-                sample_data = [str(x) for x in sample_data]
-            else:
-                sample_data = col_data.drop_nulls().head(5).to_list()
-
-            n_missing_vals = int(col_data.is_null().sum())
-            n_unique_vals = int(col_data.n_unique())
-
-            # If there are missing values, subtract 1 from the number of unique values
-            # to account for the missing value which shouldn't be included in the count
-            if (n_missing_vals > 0) and (n_unique_vals > 0):
-                n_unique_vals = n_unique_vals - 1
-
-            f_missing_vals = _round_to_sig_figs(n_missing_vals / row_count, 3)
-            f_unique_vals = _round_to_sig_figs(n_unique_vals / row_count, 3)
-
-            col_profile = {
-                "column_name": column,
-                "column_type": native_dtype,
-                "column_number": idx + 1,
-                "n_missing_values": n_missing_vals,
-                "f_missing_values": f_missing_vals,
-                "n_unique_values": n_unique_vals,
-                "f_unique_values": f_unique_vals,
-            }
-
-            #
-            # Numerical columns
-            #
-            if "int" in str(col_data.dtype).lower() or "float" in str(col_data.dtype).lower():
-                n_negative_vals = int(col_data.is_between(-1e26, -1e-26).sum())
-                f_negative_vals = _round_to_sig_figs(n_negative_vals / row_count, 3)
-
-                n_zero_vals = int(col_data.is_between(0, 0).sum())
-                f_zero_vals = _round_to_sig_figs(n_zero_vals / row_count, 3)
-
-                n_positive_vals = row_count - n_missing_vals - n_negative_vals - n_zero_vals
-                f_positive_vals = _round_to_sig_figs(n_positive_vals / row_count, 3)
-
-                col_profile_additional = {
-                    "n_negative_values": n_negative_vals,
-                    "f_negative_values": f_negative_vals,
-                    "n_zero_values": n_zero_vals,
-                    "f_zero_values": f_zero_vals,
-                    "n_positive_values": n_positive_vals,
-                    "f_positive_values": f_positive_vals,
-                    "sample_data": sample_data,
-                }
-                col_profile.update(col_profile_additional)
-
-                col_profile_stats = {
-                    "statistics": {
-                        "numerical": {
-                            "descriptive": {
-                                "mean": round(float(col_data.mean()), 2),
-                                "std_dev": round(float(col_data.std()), 4),
-                            },
-                            "quantiles": {
-                                "min": float(col_data.min()),
-                                "p05": round(
-                                    float(col_data.quantile(0.05, interpolation="linear")), 2
-                                ),
-                                "q_1": round(
-                                    float(col_data.quantile(0.25, interpolation="linear")), 2
-                                ),
-                                "med": float(col_data.median()),
-                                "q_3": round(
-                                    float(col_data.quantile(0.75, interpolation="linear")), 2
-                                ),
-                                "p95": round(
-                                    float(col_data.quantile(0.95, interpolation="linear")), 2
-                                ),
-                                "max": float(col_data.max()),
-                                "iqr": round(
-                                    float(col_data.quantile(0.75, interpolation="linear"))
-                                    - float(col_data.quantile(0.25, interpolation="linear")),
-                                    2,
-                                ),
-                            },
-                        }
-                    }
-                }
-                col_profile.update(col_profile_stats)
-
-            #
-            # String columns
-            #
-            elif (
-                "string" in str(col_data.dtype).lower()
-                or "categorical" in str(col_data.dtype).lower()
-            ):
-                col_profile_additional = {
-                    "sample_data": sample_data,
-                }
-                col_profile.update(col_profile_additional)
-
-                # Transform `col_data` to a column of string lengths
-                col_str_len_data = col_data.str.len_chars()
-
-                col_profile_stats = {
-                    "statistics": {
-                        "string_lengths": {
-                            "descriptive": {
-                                "mean": round(float(col_str_len_data.mean()), 2),
-                                "std_dev": round(float(col_str_len_data.std()), 4),
-                            },
-                            "quantiles": {
-                                "min": int(col_str_len_data.min()),
-                                "p05": int(col_str_len_data.quantile(0.05, interpolation="linear")),
-                                "q_1": int(col_str_len_data.quantile(0.25, interpolation="linear")),
-                                "med": int(col_str_len_data.median()),
-                                "q_3": int(col_str_len_data.quantile(0.75, interpolation="linear")),
-                                "p95": int(col_str_len_data.quantile(0.95, interpolation="linear")),
-                                "max": int(col_str_len_data.max()),
-                                "iqr": int(col_str_len_data.quantile(0.75, interpolation="linear"))
-                                - int(col_str_len_data.quantile(0.25, interpolation="linear")),
-                            },
-                        }
-                    }
-                }
-                col_profile.update(col_profile_stats)
-
-            #
-            # Date and datetime columns
-            #
-            elif "date" in str(col_data.dtype).lower():
-                col_profile_additional = {
-                    "sample_data": sample_data,
-                }
-                col_profile.update(col_profile_additional)
-
-                min_date = str(col_data.min())
-                max_date = str(col_data.max())
-
-                col_profile_stats = {
-                    "statistics": {
-                        "datetime": {
-                            "min": min_date,
-                            "max": max_date,
-                        }
-                    }
-                }
-                col_profile.update(col_profile_stats)
-
-            #
-            # Boolean columns
-            #
-            elif "bool" in str(col_data.dtype).lower():
-                col_profile_additional = {
-                    "sample_data": sample_data,
-                }
-                col_profile.update(col_profile_additional)
-
-                n_true_values = int(col_data.sum())
-                f_true_values = _round_to_sig_figs(n_true_values / row_count, 3)
-
-                n_false_values = row_count - n_missing_vals - n_true_values
-                f_false_values = _round_to_sig_figs(n_false_values / row_count, 3)
-
-                col_profile_stats = {
-                    "statistics": {
-                        "boolean": {
-                            "n_true_values": n_true_values,
-                            "f_true_values": f_true_values,
-                            "n_false_values": n_false_values,
-                            "f_false_values": f_false_values,
-                        }
-                    }
-                }
-                col_profile.update(col_profile_stats)
-
-            profile["columns"].append(col_profile)
-
-        return profile
-
-    def _generate_profile_ibis(self) -> dict:
-        profile = {}
+    # TODO: This needs to be generically typed at the class level, ie. DataScan[T]
+    def __init__(self, data: IntoFrameT, tbl_name: str | None = None) -> None:
+        as_native = nw.from_native(data)
 
-        if self.tbl_name:
-            profile["tbl_name"] = self.tbl_name
+        if as_native.implementation.name == "IBIS" and as_native._level == "lazy":
+            assert isinstance(as_native, LazyFrame)  # help mypy
 
-
+            ibis_native = as_native.to_native()
 
-
-
-
-
-
-
-
-                "columns": [],
-            }
-        )
-
-        # Determine which DataFrame library is available
-        df_lib = _select_df_lib(preference="polars")
-        df_lib_str = str(df_lib)
-
-        if "polars" in df_lib_str:
-            df_lib_use = "polars"
-        else:
-            df_lib_use = "pandas"
-
-        column_dtypes = list(self.data.schema().items())
-
-        for idx, column in enumerate(self.data.columns):
-            dtype_str = str(column_dtypes[idx][1])
-
-            col_data = self.data[column]
-            col_data_no_null = self.data.drop_null().head(5)[column]
-
-            #
-            # Collection of sample data
-            #
-            if "date" in dtype_str.lower() or "timestamp" in dtype_str.lower():
-                if df_lib_use == "polars":
-                    import polars as pl
-
-                    sample_data = col_data_no_null.to_polars().cast(pl.String).to_list()
-                else:
-                    sample_data = col_data_no_null.to_pandas().astype(str).to_list()
+            valid_conversion_methods = ("to_pyarrow", "to_pandas", "to_polars")
+            for conv_method in valid_conversion_methods:
+                try:
+                    valid_native = getattr(ibis_native, conv_method)()
+                except (NotImplementedError, ImportError, ModuleNotFoundError):
+                    continue
+                break
             else:
-
-
-
-
-
-            n_missing_vals = int(_to_df_lib(col_data.isnull().sum(), df_lib=df_lib_use))
-            n_unique_vals = int(_to_df_lib(col_data.nunique(), df_lib=df_lib_use))
-
-            # If there are missing values, subtract 1 from the number of unique values
-            # to account for the missing value which shouldn't be included in the count
-            if (n_missing_vals > 0) and (n_unique_vals > 0):
-                n_unique_vals = n_unique_vals - 1
-
-            f_missing_vals = _round_to_sig_figs(n_missing_vals / row_count, 3)
-            f_unique_vals = _round_to_sig_figs(n_unique_vals / row_count, 3)
-
-            col_profile = {
-                "column_name": column,
-                "column_type": dtype_str,
-                "column_number": idx + 1,
-                "n_missing_values": n_missing_vals,
-                "f_missing_values": f_missing_vals,
-                "n_unique_values": n_unique_vals,
-                "f_unique_values": f_unique_vals,
-            }
-
-            #
-            # Numerical columns
-            #
-            if "int" in dtype_str.lower() or "float" in dtype_str.lower():
-                n_negative_vals = int(
-                    _to_df_lib(col_data.between(-1e26, -1e-26).sum(), df_lib=df_lib_use)
+                msg = (
+                    "To use `ibis` as input, you must have one of arrow, pandas, polars or numpy "
+                    "available in the process. Until `ibis` is fully supported by Narwhals, this is "
+                    "necessary. Additionally, the data must be collected in order to calculate some "
+                    "structural statistics, which may be performance detrimental."
                 )
-                f_negative_vals = _round_to_sig_figs(n_negative_vals / row_count, 3)
-
-                n_zero_vals = int(_to_df_lib(col_data.between(0, 0).sum(), df_lib=df_lib_use))
-                f_zero_vals = _round_to_sig_figs(n_zero_vals / row_count, 3)
-
-                n_positive_vals = row_count - n_missing_vals - n_negative_vals - n_zero_vals
-                f_positive_vals = _round_to_sig_figs(n_positive_vals / row_count, 3)
-
-                col_profile_additional = {
-                    "n_negative_values": n_negative_vals,
-                    "f_negative_values": f_negative_vals,
-                    "n_zero_values": n_zero_vals,
-                    "f_zero_values": f_zero_vals,
-                    "n_positive_values": n_positive_vals,
-                    "f_positive_values": f_positive_vals,
-                    "sample_data": sample_data,
-                }
-                col_profile.update(col_profile_additional)
-
-                col_profile_stats = {
-                    "statistics": {
-                        "numerical": {
-                            "descriptive": {
-                                "mean": round(_to_df_lib(col_data.mean(), df_lib=df_lib_use), 2),
-                                "std_dev": round(_to_df_lib(col_data.std(), df_lib=df_lib_use), 4),
-                            },
-                            "quantiles": {
-                                "min": _to_df_lib(col_data.min(), df_lib=df_lib_use),
-                                "p05": round(
-                                    _to_df_lib(col_data.approx_quantile(0.05), df_lib=df_lib_use),
-                                    2,
-                                ),
-                                "q_1": round(
-                                    _to_df_lib(col_data.approx_quantile(0.25), df_lib=df_lib_use),
-                                    2,
-                                ),
-                                "med": _to_df_lib(col_data.median(), df_lib=df_lib_use),
-                                "q_3": round(
-                                    _to_df_lib(col_data.approx_quantile(0.75), df_lib=df_lib_use),
-                                    2,
-                                ),
-                                "p95": round(
-                                    _to_df_lib(col_data.approx_quantile(0.95), df_lib=df_lib_use),
-                                    2,
-                                ),
-                                "max": _to_df_lib(col_data.max(), df_lib=df_lib_use),
-                                "iqr": round(
-                                    _to_df_lib(col_data.quantile(0.75), df_lib=df_lib_use)
-                                    - _to_df_lib(col_data.quantile(0.25), df_lib=df_lib_use),
-                                    2,
-                                ),
-                            },
-                        }
-                    }
-                }
-                col_profile.update(col_profile_stats)
-
-            #
-            # String columns
-            #
-            elif "string" in dtype_str.lower() or "char" in dtype_str.lower():
-                col_profile_additional = {
-                    "sample_data": sample_data,
-                }
-                col_profile.update(col_profile_additional)
-
-                # Transform `col_data` to a column of string lengths
-                col_str_len_data = col_data.length()
-
-                col_profile_stats = {
-                    "statistics": {
-                        "string_lengths": {
-                            "descriptive": {
-                                "mean": round(
-                                    float(_to_df_lib(col_str_len_data.mean(), df_lib=df_lib_use)), 2
-                                ),
-                                "std_dev": round(
-                                    float(_to_df_lib(col_str_len_data.std(), df_lib=df_lib_use)), 4
-                                ),
-                            },
-                            "quantiles": {
-                                "min": int(_to_df_lib(col_str_len_data.min(), df_lib=df_lib_use)),
-                                "p05": int(
-                                    _to_df_lib(
-                                        col_str_len_data.approx_quantile(0.05),
-                                        df_lib=df_lib_use,
-                                    )
-                                ),
-                                "q_1": int(
-                                    _to_df_lib(
-                                        col_str_len_data.approx_quantile(0.25),
-                                        df_lib=df_lib_use,
-                                    )
-                                ),
-                                "med": int(
-                                    _to_df_lib(col_str_len_data.median(), df_lib=df_lib_use)
-                                ),
-                                "q_3": int(
-                                    _to_df_lib(
-                                        col_str_len_data.approx_quantile(0.75),
-                                        df_lib=df_lib_use,
-                                    )
-                                ),
-                                "p95": int(
-                                    _to_df_lib(
-                                        col_str_len_data.approx_quantile(0.95),
-                                        df_lib=df_lib_use,
-                                    )
-                                ),
-                                "max": int(_to_df_lib(col_str_len_data.max(), df_lib=df_lib_use)),
-                                "iqr": int(
-                                    _to_df_lib(
-                                        col_str_len_data.approx_quantile(0.75),
-                                        df_lib=df_lib_use,
-                                    )
-                                )
-                                - int(
-                                    _to_df_lib(
-                                        col_str_len_data.approx_quantile(0.25),
-                                        df_lib=df_lib_use,
-                                    )
-                                ),
-                            },
-                        }
-                    }
-                }
-                col_profile.update(col_profile_stats)
-
-            #
-            # Date and datetime columns
-            #
-            elif "date" in dtype_str.lower() or "timestamp" in dtype_str.lower():
-                col_profile_additional = {
-                    "sample_data": sample_data,
-                }
-                col_profile.update(col_profile_additional)
-
-                min_date = _to_df_lib(col_data.min(), df_lib=df_lib_use)
-                max_date = _to_df_lib(col_data.max(), df_lib=df_lib_use)
-
-                col_profile_stats = {
-                    "statistics": {
-                        "datetime": {
-                            "min": str(min_date),
-                            "max": str(max_date),
-                        }
-                    }
-                }
-                col_profile.update(col_profile_stats)
-
-            #
-            # Boolean columns
-            #
-            elif "bool" in dtype_str.lower():
-                col_profile_additional = {
-                    "sample_data": sample_data,
-                }
-                col_profile.update(col_profile_additional)
-
-                n_true_values = _to_df_lib(col_data.cast(int).sum(), df_lib=df_lib)
-                f_true_values = _round_to_sig_figs(n_true_values / row_count, 3)
-
-                n_false_values = row_count - n_missing_vals - n_true_values
-                f_false_values = _round_to_sig_figs(n_false_values / row_count, 3)
-
-                col_profile_stats = {
-                    "statistics": {
-                        "boolean": {
-                            "n_true_values": n_true_values,
-                            "f_true_values": f_true_values,
-                            "n_false_values": n_false_values,
-                            "f_false_values": f_false_values,
-                        }
-                    }
-                }
-                col_profile.update(col_profile_stats)
-
-            profile["columns"].append(col_profile)
-
-        return profile
-
-    def get_tabular_report(self) -> GT:
-        column_data = self.profile["columns"]
-
-        tbl_name = self.tbl_name
+                raise ImportError(msg)
+            as_native = nw.from_native(valid_native)
 
-
-        datetime_row_list = []
+        self.nw_data: Frame = nw.from_native(as_native)
 
-
-
+        self.tbl_name: str | None = tbl_name
+        self.profile: _DataProfile = self._generate_profile_df()
 
-
-
-            if "statistics" in col and "numerical" in col["statistics"]:
-                col_dict = _process_numerical_column_data(col)
-            elif "statistics" in col and "string_lengths" in col["statistics"]:
-                col_dict = _process_string_column_data(col)
-            elif "statistics" in col and "datetime" in col["statistics"]:
-                col_dict = _process_datetime_column_data(col)
-                datetime_row_list.append(idx)
-            elif "statistics" in col and "boolean" in col["statistics"]:
-                col_dict = _process_boolean_column_data(col)
-            else:
-                col_dict = _process_other_column_data(col)
+    def _generate_profile_df(self) -> _DataProfile:
+        columns: list[str] = self.nw_data.columns
 
-
+        profile = _DataProfile(
+            table_name=self.tbl_name,
+            columns=columns,
+            implementation=self.nw_data.implementation,
+        )
+        schema: Mapping[str, Any] = self.nw_data.schema
+        for column in columns:
+            col_data: DataFrame = self.nw_data.select(column)
+
+            ## Handle dtyping:
+            native_dtype = schema[column]
+            if _TypeMap.is_illegal(native_dtype):
+                continue
+            try:
+                prof: type[ColumnProfile] = _TypeMap.fetch_profile(native_dtype)
+            except NotImplementedError:
+                continue
+
+            col_profile = ColumnProfile(colname=column, coltype=native_dtype)
+
+            ## Collect Sample Data:
+            ## This is the most consistent way (i think) to get the samples out of the data.
+            ## We can avoid writing our own logic to determine operations and rely on narwhals.
+            raw_vals: list[Any] = (
+                _as_physical(col_data.drop_nulls().head(5)).to_dict()[column].to_list()
+            )
+            col_profile.sample_data = [str(x) for x in raw_vals]
 
-
-        # based on the available library
-        df_lib = _select_df_lib(preference="polars")
-        df_lib_str = str(df_lib)
+            col_profile.calc_stats(col_data)
 
-
-
+            sub_profile: ColumnProfile = col_profile.spawn_profile(prof)
+            sub_profile.calc_stats(col_data)
 
-
-        else:
-            import pandas as pd
+            profile.column_profiles.append(sub_profile)
 
-
+        profile.set_row_count(self.nw_data)
 
-
+        return profile
 
-
-
-
-            "mean",
-            "std_dev",
-            "min",
-            "p05",
-            "q_1",
-            "med",
-            "q_3",
-            "p95",
-            "max",
-            "iqr",
-        ]
+    @property
+    def summary_data(self) -> IntoFrameT:
+        return self.profile.as_dataframe(strict=False).to_native()
 
+    def get_tabular_report(self, *, show_sample_data: bool = False) -> GT:
         # Create the label, table type, and thresholds HTML fragments
         table_type_html = _create_table_type_html(
-            tbl_type=self.tbl_type, tbl_name=tbl_name, font_size="10px"
+            tbl_type=str(self.profile.implementation), tbl_name=self.tbl_name, font_size="10px"
         )
 
-        tbl_dims_html = _create_table_dims_html(
+        tbl_dims_html = _create_table_dims_html(
+            columns=len(self.profile.columns), rows=self.profile.row_count, font_size="10px"
+        )
 
         # Compose the subtitle HTML fragment
         combined_title = (
```
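The conversion fallback in the new `__init__` above leans on Python's `for`/`else` construct: the `else` suite runs only when the loop finishes without hitting `break`, that is, when every conversion method failed. A stripped-down sketch of that control flow (the function name and error message are illustrative, not part of the package):

```python
def materialize(ibis_table):
    """Return the first successful conversion of an Ibis table."""
    for conv_method in ("to_pyarrow", "to_pandas", "to_polars"):
        try:
            converted = getattr(ibis_table, conv_method)()
        except (NotImplementedError, ImportError, ModuleNotFoundError):
            continue  # backend not available; try the next conversion
        break  # a conversion succeeded, so the else suite is skipped
    else:
        # Reached only if the loop never hit `break`.
        raise ImportError("no pyarrow/pandas/polars backend is available")
    return converted
```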
```diff
@@ -685,113 +219,272 @@ class DataScan:
 
         # TODO: Ensure width is 905px in total
 
-
-
-
-
-
-            .
-
-
-
+        data: DataFrame = self.profile.as_dataframe(strict=False)
+
+        ## Remove all null columns:
+        all_null: list[str] = []
+        for stat_name in data.iter_columns():
+            col_len = len(stat_name.drop_nulls())
+            if col_len == 0:
+                all_null.append(stat_name.name)
+        data = data.drop(all_null)
+
+        if not show_sample_data:
+            data = data.drop("sample_data")
+
+        # find what stat cols were used in the analysis
+        non_stat_cols = ("icon", "colname")  # TODO: need a better place for this
+        present_stat_cols: set[str] = set(data.columns) - set(non_stat_cols)
+        present_stat_cols.remove("coltype")
+        with contextlib.suppress(KeyError):
+            present_stat_cols.remove("freqs")  # TODO: currently used for html but no displayed?
+
+        ## Assemble the target order and find what columns need borders.
+        ## Borders should be placed to divide the stat "groups" and create a
+        ## generally more aesthetically pleasing experience.
+        target_order: list[str] = list(non_stat_cols)
+        right_border_cols: list[str] = [non_stat_cols[-1]]
+
+        last_group: StatGroup = COLUMN_ORDER_REGISTRY[0].group
+        for col in COLUMN_ORDER_REGISTRY:
+            if col.name in present_stat_cols:
+                cur_group: StatGroup = col.group
+                target_order.append(col.name)
+
+                start_new_group: bool = last_group != cur_group
+                if start_new_group:
+                    last_group = cur_group
+                    last_col_added = target_order[-2]  # -2 since we don't include the current
+                    right_border_cols.append(last_col_added)
+
+        right_border_cols.append(target_order[-1])  # add border to last stat col
+
+        label_map: dict[str, Any] = self._build_label_map(target_order)
+
+        ## Final Formatting:
+        formatted_data = data.with_columns(
+            colname=nw.concat_str(
+                nw.lit(
+                    "<div style='font-size: 13px; white-space: nowrap; text-overflow: ellipsis; overflow: hidden;'>"
+                ),
+                nw.col("colname"),
+                nw.lit("</div><div style='font-size: 11px; color: gray;'>"),
+                nw.col("coltype"),
+                nw.lit("</div>"),
+            ),
+            __frac_n_unique=nw.col("n_unique") / nw.lit(self.profile.row_count),
+            __frac_n_missing=nw.col("n_missing") / nw.lit(self.profile.row_count),
+        )
+
+        ## Pull out type indicies:
+        # TODO: The stat types should get an enum? or something?
+        # TODO: This all assumes the dates are separated by dashes, is that even true?
+        # TODO: This all assumes date_stats are strings already, not ints or anything else.
+        any_dates: bool = formatted_data.select(
+            __tmp_idx=nw.col("coltype").str.contains("Date", literal=True)
+        )["__tmp_idx"].any()
+        if any_dates:
+            date_stats = [c for c in present_stat_cols if c in ("min", "max")]
+
+            formatted_data = formatted_data.with_columns(
+                nw.when(nw.col("coltype").str.contains(r"\bDate\b", literal=False))
+                .then(nw.col(c).cast(nw.String).str.replace_all("-", "<br>"))
+                .otherwise(nw.col(c).cast(nw.String))
+                for c in date_stats
             )
-
-
-
+
+            any_datetimes: bool = formatted_data.select(
+                __tmp_idx=nw.col("coltype").str.contains("Datetime", literal=True)
+            )["__tmp_idx"].any()
+            if any_datetimes:
+                datetime_idx = [c for c in present_stat_cols if c in ("min", "max")]
+                formatted_data = formatted_data.with_columns(
+                    nw.when(nw.col("coltype").str.contains(r"\bDatetime\b", literal=False))
+                    .then(nw.col(c).cast(nw.String).str.replace_all("-", "<br>"))
+                    .otherwise(nw.col(c).cast(nw.String))
+                    for c in datetime_idx
             )
-
-
-
+
+        # format fractions:
+        # this is an anti-pattern but there's no serious alternative
+        for _fmt_col in ("__frac_n_unique", "__frac_n_missing"):
+            _formatted: list[str | None] = _fmt_frac(formatted_data[_fmt_col])
+            formatted: nw.Series = nw.new_series(
+                _fmt_col, values=_formatted, backend=self.profile.implementation
             )
-            .
-
-
+            formatted_data = formatted_data.drop(_fmt_col)
+            formatted_data = formatted_data.with_columns(formatted.alias(_fmt_col))
+
+        formatted_data = (
+            # TODO: This is a temporary solution?
+            # Format the unique and missing pct strings
+            formatted_data.with_columns(
+                n_unique=nw.concat_str(
+                    nw.col("n_unique"),
+                    nw.lit("<br>"),
+                    nw.col("__frac_n_unique"),
+                ),
+                n_missing=nw.concat_str(
+                    nw.col("n_missing"),
+                    nw.lit("<br>"),
+                    nw.col("__frac_n_missing"),
+                ),
             )
-
-
-
+            # TODO: Should be able to use selectors for this
+            .drop("__frac_n_unique", "__frac_n_missing", "coltype")
+        )
+
+        if "freqs" in formatted_data.columns:  # TODO: don't love this arbitrary check
+            # Extract HTML freqs:
+            try:
+                formatted_data = formatted_data.with_columns(
+                    __freq_true=nw.col("freqs").struct.field("True"),
+                    __freq_false=nw.col("freqs").struct.field("False"),
+                )
+            except Exception:  # TODO: should be narrowed if possible
+                # if no struct implimentation exists, it must be done manually
+                freq_ser: nw.Series = formatted_data["freqs"]
+                trues: list[int | None] = []
+                falses: list[int | None] = []
+                for freq in freq_ser:
+                    try:
+                        trues.append(freq["True"])
+                        falses.append(freq["False"])
+                    except (KeyError, TypeError):
+                        trues.append(None)
+                        falses.append(None)
+                true_ser: nw.Series = nw.new_series(
+                    name="__freq_true", values=trues, backend=self.profile.implementation
+                )
+                false_ser: nw.Series = nw.new_series(
+                    name="__freq_false", values=falses, backend=self.profile.implementation
+                )
+                formatted_data = formatted_data.with_columns(
+                    __freq_true=true_ser, __freq_false=false_ser
+                )
+
+            ## format pct true values
+            formatted_data = formatted_data.with_columns(
+                # for bools, UQs are represented as percentages
+                __pct_true=nw.col("__freq_true") / self.profile.row_count,
+                __pct_false=nw.col("__freq_false") / self.profile.row_count,
             )
-
-
-
+            for _fmt_col in ("__pct_true", "__pct_false"):
+                _formatted: list[str | None] = _fmt_frac(formatted_data[_fmt_col])
+                formatted = nw.new_series(
+                    name=_fmt_col, values=_formatted, backend=self.profile.implementation
+                )
+                formatted_data = formatted_data.drop(_fmt_col)
+                formatted_data = formatted_data.with_columns(formatted.alias(_fmt_col))
+
+            formatted_data = (
+                formatted_data.with_columns(
+                    __bool_unique_html=nw.concat_str(
+                        nw.lit("<span style='font-weight: bold;'>T</span>"),
+                        nw.col("__pct_true"),
+                        nw.lit("<br><span style='font-weight: bold;'>F</span>"),
+                        nw.col("__pct_false"),
+                    ),
+                )
+                .with_columns(
+                    n_unique=nw.when(~nw.col("__bool_unique_html").is_null())
+                    .then(nw.col("__bool_unique_html"))
+                    .otherwise(nw.col("n_unique"))
+                )
+                .drop(
+                    "__freq_true",
+                    "__freq_false",
+                    "__bool_unique_html",
+                    "freqs",
+                    "__pct_true",
+                    "__pct_false",
+                )
             )
+
+        ## Determine Value Formatting Selectors:
+        fmt_int: list[str] = formatted_data.select(nw.selectors.by_dtype(nw.dtypes.Int64)).columns
+        fmt_float: list[str] = formatted_data.select(
+            nw.selectors.by_dtype(nw.dtypes.Float64)
+        ).columns
+
+        ## GT Table:
+        gt_tbl = (
+            GT(formatted_data.to_native())
+            .tab_header(title=html(combined_title))
+            .tab_source_note(source_note="String columns statistics regard the string's length.")
+            .cols_align(align="right", columns=list(present_stat_cols))
+            .opt_table_font(font=google_font("IBM Plex Sans"))
+            .opt_align_table_header(align="left")
+            .tab_style(style=style.text(font=google_font("IBM Plex Mono")), locations=loc.body())
+            .cols_move_to_start(target_order)
+            ## Labeling
+            .cols_label(label_map)
+            .cols_label(icon="", colname="Column")
+            .cols_align("center", columns=list(present_stat_cols))
             .tab_style(
-                style=style.
-                locations=loc.body(columns=["std_dev", "p05", "q_1", "med", "q_3", "p95", "max"]),
+                style=style.text(align="right"), locations=loc.body(columns=list(present_stat_cols))
             )
-
-
-
-
-
-
+            ## Value Formatting
+            .fmt_integer(columns=fmt_int)
+            .fmt_number(
+                columns=fmt_float,
+                decimals=2,
+                drop_trailing_dec_mark=True,
+                drop_trailing_zeros=True,
             )
+            ## Borders
             .tab_style(
-                style=style.
-                locations=loc.body(columns=
+                style=style.borders(sides="right", color="#D3D3D3", style="solid"),
+                locations=loc.body(columns=right_border_cols),
             )
             .tab_style(
-                style=style.
+                style=style.borders(sides="left", color="#E5E5E5", style="dashed"),
+                locations=loc.body(columns=list(present_stat_cols)),
             )
-
-
-
-
-                missing_vals="NA",
-                unique_vals="UQ",
-                mean="Mean",
-                std_dev="SD",
-                min="Min",
-                p05=html(
-                    'P<span style="font-size: 0.75em; vertical-align: sub; position: relative; line-height: 0.5em;">5</span>'
-                ),
-                q_1=html(
-                    'Q<span style="font-size: 0.75em; vertical-align: sub; position: relative; line-height: 0.5em;">1</span>'
-                ),
-                med="Med",
-                q_3=html(
-                    'Q<span style="font-size: 0.75em; vertical-align: sub; position: relative; line-height: 0.5em;">3</span>'
-                ),
-                p95=html(
-                    'P<span style="font-size: 0.75em; vertical-align: sub; position: relative; line-height: 0.5em;">95</span>'
-                ),
-                max="Max",
-                iqr="IQR",
+            ## Formatting
+            .tab_style(
+                style=style.text(size="10px"),
+                locations=loc.body(columns=list(present_stat_cols)),
             )
+            .tab_style(style=style.text(size="12px"), locations=loc.body(columns="colname"))
             .cols_width(
-
-                icon="35px",
-                column_name="200px",
-                missing_vals="50px",
-                unique_vals="50px",
-                mean="50px",
-                std_dev="50px",
-                min="50px",
-                p05="50px",
-                q_1="50px",
-                med="50px",
-                q_3="50px",
-                p95="50px",
-                max="50px",
-                iqr="50px",  # 875 px total
+                icon="35px", colname="200px", **{stat_col: "60px" for stat_col in present_stat_cols}
             )
         )
 
+        if "PYARROW" != formatted_data.implementation.name:
+            # TODO: this is more proactive than it should be
+            gt_tbl = gt_tbl.sub_missing(missing_text="-")
+            # https://github.com/posit-dev/great-tables/issues/667
+
         # If the version of `great_tables` is `>=0.17.0` then disable Quarto table processing
         if version("great_tables") >= "0.17.0":
             gt_tbl = gt_tbl.tab_options(quarto_disable_processing=True)
 
         return gt_tbl
 
-
-
+    @staticmethod
+    def _build_label_map(cols: Sequence[str]) -> dict[str, Any]:
+        label_map: dict[str, Any] = {}
+        for target_col in cols:
+            try:
+                matching_stat = next(
+                    stat for stat in COLUMN_ORDER_REGISTRY if target_col == stat.name
+                )
+            except StopIteration:
+                continue
+            label_map[target_col] = matching_stat.label
+        return label_map
 
     def to_json(self) -> str:
-
+        prof_dict = self.profile.as_dataframe(strict=False).to_dict(as_series=False)
+
+        return json.dumps(prof_dict, indent=4, default=str)
 
     def save_to_json(self, output_file: str):
+        json_string: str = self.to_json()
         with open(output_file, "w") as f:
-            json.dump(
+            json.dump(json_string, f, indent=4)
 
 
 def col_summary_tbl(data: FrameT | Any, tbl_name: str | None = None) -> GT:
```
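One behavior worth flagging in the new `save_to_json` shown above: `to_json` already returns a serialized JSON string, and handing that string to `json.dump` serializes it a second time, so the output file holds a single quoted string rather than a JSON object. A self-contained illustration (file names are arbitrary):

```python
import json

payload = json.dumps({"rows": 3})  # already a JSON document, as a str

with open("scan_a.json", "w") as f:
    json.dump(payload, f)  # re-encodes: the file contains one quoted, escaped string

with open("scan_b.json", "w") as f:
    f.write(payload)  # writes the JSON object itself: {"rows": 3}
```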
```diff
@@ -875,337 +568,3 @@ def col_summary_tbl(data: FrameT | Any, tbl_name: str | None = None) -> GT:
 
     scanner = DataScan(data=data, tbl_name=tbl_name)
     return scanner.get_tabular_report()
-
-
-def _to_df_lib(expr: any, df_lib: str) -> any:
-    if df_lib == "polars":
-        return expr.to_polars()
-    else:
-        return expr.to_pandas()
-
-
-def _round_to_sig_figs(value: float, sig_figs: int) -> float:
-    if value == 0:
-        return 0
-    return round(value, sig_figs - int(floor(log10(abs(value)))) - 1)
-
-
-def _compact_integer_fmt(value: float | int) -> str:
-    if value == 0:
-        formatted = "0"
-    elif abs(value) >= 1 and abs(value) < 10_000:
-        formatted = fmt_integer(value, use_seps=False)[0]
-    else:
-        formatted = fmt_scientific(value, decimals=1, exp_style="E1")[0]
-
-    return formatted
-
-
-def _compact_decimal_fmt(value: float | int) -> str:
-    if value == 0:
-        formatted = "0.00"
-    elif abs(value) < 1 and abs(value) >= 0.01:
-        formatted = fmt_number(value, decimals=2)[0]
-    elif abs(value) < 0.01:
-        formatted = fmt_scientific(value, decimals=1, exp_style="E1")[0]
-    elif abs(value) >= 1 and abs(value) < 10:
-        formatted = fmt_number(value, decimals=2, use_seps=False)[0]
-    elif abs(value) >= 10 and abs(value) < 1000:
-        formatted = fmt_number(value, n_sigfig=3)[0]
-    elif abs(value) >= 1000 and abs(value) < 10_000:
-        formatted = fmt_number(value, n_sigfig=4, use_seps=False)[0]
-    else:
-        formatted = fmt_scientific(value, decimals=1, exp_style="E1")[0]
-
-    return formatted
-
-
-def _compact_0_1_fmt(value: float | int) -> str:
-    if value == 0:
-        formatted = " 0.00"
-    elif value == 1:
-        formatted = " 1.00"
-    elif abs(value) < 0.01:
-        formatted = "<0.01"
-    elif abs(value) > 0.99 and abs(value) < 1.0:
-        formatted = ">0.99"
-    elif abs(value) <= 0.99 and abs(value) >= 0.01:
-        formatted = " " + fmt_number(value, decimals=2)[0]
-    else:
-        formatted = fmt_number(value, n_sigfig=3)[0]
-    return formatted
-
-
-def _process_numerical_column_data(column_data: dict) -> dict:
-    column_number = column_data["column_number"]
-    column_name = column_data["column_name"]
-    column_type = column_data["column_type"]
-
-    column_name_and_type = (
-        f"<div style='font-size: 13px; white-space: nowrap; text-overflow: ellipsis; overflow: hidden;'>{column_name}</div>"
-        f"<div style='font-size: 11px; color: gray;'>{column_type}</div>"
-    )
-
-    # Get the Missing and Unique value counts and fractions
-    missing_vals = column_data["n_missing_values"]
-    unique_vals = column_data["n_unique_values"]
-    missing_vals_frac = _compact_0_1_fmt(column_data["f_missing_values"])
-    unique_vals_frac = _compact_0_1_fmt(column_data["f_unique_values"])
-
-    missing_vals_str = f"{missing_vals}<br>{missing_vals_frac}"
-    unique_vals_str = f"{unique_vals}<br>{unique_vals_frac}"
-
-    # Get the descriptive and quantile statistics
-    descriptive_stats = column_data["statistics"]["numerical"]["descriptive"]
-    quantile_stats = column_data["statistics"]["numerical"]["quantiles"]
-
-    # Get all values from the descriptive and quantile stats into a single list
-    quantile_stats_vals = [v[1] for v in quantile_stats.items()]
-
-    # Determine if the quantile stats are all integerlike
-    integerlike = []
-
-    # Determine if the quantile stats are integerlike
-    for val in quantile_stats_vals:
-        # Check if a quantile value is a number and then if it is intergerlike
-        if not isinstance(val, (int, float)):
-            continue  # pragma: no cover
-        else:
-            integerlike.append(val % 1 == 0)
-    quantile_vals_integerlike = all(integerlike)
-
-    # Determine the formatter to use for the quantile values
-    if quantile_vals_integerlike:
-        q_formatter = _compact_integer_fmt
-    else:
-        q_formatter = _compact_decimal_fmt
-
-    # Format the descriptive statistics (mean and standard deviation)
-    for key, value in descriptive_stats.items():
-        descriptive_stats[key] = _compact_decimal_fmt(value=value)
-
-    # Format the quantile statistics
-    for key, value in quantile_stats.items():
-        quantile_stats[key] = q_formatter(value=value)
-
-    # Create a single dictionary with the statistics for the column
-    stats_dict = {
-        "column_number": column_number,
-        "icon": SVG_ICONS_FOR_DATA_TYPES["numeric"],
-        "column_name": column_name_and_type,
-        "missing_vals": missing_vals_str,
-        "unique_vals": unique_vals_str,
-        **descriptive_stats,
-        **quantile_stats,
-    }
-
-    return stats_dict
-
-
-def _process_string_column_data(column_data: dict) -> dict:
-    column_number = column_data["column_number"]
-    column_name = column_data["column_name"]
-    column_type = column_data["column_type"]
-
-    column_name_and_type = (
-        f"<div style='font-size: 13px; white-space: nowrap; text-overflow: ellipsis; overflow: hidden;'>{column_name}</div>"
-        f"<div style='font-size: 11px; color: gray;'>{column_type}</div>"
-    )
-
-    # Get the Missing and Unique value counts and fractions
-    missing_vals = column_data["n_missing_values"]
-    unique_vals = column_data["n_unique_values"]
-    missing_vals_frac = _compact_0_1_fmt(column_data["f_missing_values"])
-    unique_vals_frac = _compact_0_1_fmt(column_data["f_unique_values"])
-
-    missing_vals_str = f"{missing_vals}<br>{missing_vals_frac}"
-    unique_vals_str = f"{unique_vals}<br>{unique_vals_frac}"
-
-    # Get the descriptive and quantile statistics
-    descriptive_stats = column_data["statistics"]["string_lengths"]["descriptive"]
-    quantile_stats = column_data["statistics"]["string_lengths"]["quantiles"]
-
-    # Format the descriptive statistics (mean and standard deviation)
-    for key, value in descriptive_stats.items():
-        formatted_val = _compact_decimal_fmt(value=value)
-        descriptive_stats[key] = (
-            f'<div><div>{formatted_val}</div><div style="float: left; position: absolute;">'
-            '<div title="string length measure" style="font-size: 7px; color: #999; '
-            'font-style: italic; cursor: help;">SL</div></div></div>'
-        )
-
-    # Format the quantile statistics
-    for key, value in quantile_stats.items():
-        formatted_val = _compact_integer_fmt(value=value)
-        quantile_stats[key] = (
-            f'<div><div>{formatted_val}</div><div style="float: left; position: absolute;">'
-            '<div title="string length measure" style="font-size: 7px; color: #999; '
-            'font-style: italic; cursor: help;">SL</div></div></div>'
-        )
-
-    # Create a single dictionary with the statistics for the column
-    stats_dict = {
-        "column_number": column_number,
-        "icon": SVG_ICONS_FOR_DATA_TYPES["string"],
-        "column_name": column_name_and_type,
-        "missing_vals": missing_vals_str,
-        "unique_vals": unique_vals_str,
-        **descriptive_stats,
-        "min": quantile_stats["min"],
-        "p05": "—",
-        "q_1": "—",
-        "med": quantile_stats["med"],
-        "q_3": "—",
-        "p95": "—",
-        "max": quantile_stats["max"],
-        "iqr": "—",
-    }
-
-    return stats_dict
-
-
-def _process_datetime_column_data(column_data: dict) -> dict:
-    column_number = column_data["column_number"]
-    column_name = column_data["column_name"]
-    column_type = column_data["column_type"]
-
-    long_column_type = len(column_type) > 22
-
-    if long_column_type:
-        column_type_style = "font-size: 7.5px; color: gray; margin-top: 3px; margin-bottom: 2px;"
-    else:
-        column_type_style = "font-size: 11px; color: gray;"
-
-    column_name_and_type = (
-        f"<div style='font-size: 13px; white-space: nowrap; text-overflow: ellipsis; overflow: hidden;'>{column_name}</div>"
-        f"<div style='{column_type_style}'>{column_type}</div>"
-    )
-
-    # Get the Missing and Unique value counts and fractions
-    missing_vals = column_data["n_missing_values"]
-    unique_vals = column_data["n_unique_values"]
-    missing_vals_frac = _compact_0_1_fmt(column_data["f_missing_values"])
-    unique_vals_frac = _compact_0_1_fmt(column_data["f_unique_values"])
-
-    missing_vals_str = f"{missing_vals}<br>{missing_vals_frac}"
-    unique_vals_str = f"{unique_vals}<br>{unique_vals_frac}"
-
-    # Get the min and max date
-    min_date = column_data["statistics"]["datetime"]["min"]
-    max_date = column_data["statistics"]["datetime"]["max"]
-
-    # Format the dates so that they don't break across lines
-    min_max_date_str = f"<span style='text-align: left; white-space: nowrap; overflow-x: visible;'> {min_date} – {max_date}</span>"
-
-    # Create a single dictionary with the statistics for the column
-    stats_dict = {
-        "column_number": column_number,
-        "icon": SVG_ICONS_FOR_DATA_TYPES["date"],
-        "column_name": column_name_and_type,
-        "missing_vals": missing_vals_str,
-        "unique_vals": unique_vals_str,
-        "mean": "—",
-        "std_dev": "—",
-        "min": min_max_date_str,
-        "p05": "",
-        "q_1": "",
-        "med": "",
-        "q_3": "",
-        "p95": "",
-        "max": "",
-        "iqr": "—",
-    }
-
-    return stats_dict
-
-
-def _process_boolean_column_data(column_data: dict) -> dict:
-    column_number = column_data["column_number"]
-    column_name = column_data["column_name"]
-    column_type = column_data["column_type"]
-
-    column_name_and_type = (
-        f"<div style='font-size: 13px; white-space: nowrap; text-overflow: ellipsis; overflow: hidden;'>{column_name}</div>"
-        f"<div style='font-size: 11px; color: gray;'>{column_type}</div>"
-    )
-
-    # Get the missing value count and fraction
-    missing_vals = column_data["n_missing_values"]
-    missing_vals_frac = _compact_0_1_fmt(column_data["f_missing_values"])
-    missing_vals_str = f"{missing_vals}<br>{missing_vals_frac}"
-
-    # Get the fractions of True and False values
-    f_true_values = column_data["statistics"]["boolean"]["f_true_values"]
-    f_false_values = column_data["statistics"]["boolean"]["f_false_values"]
-
-    true_vals_frac_fmt = _compact_0_1_fmt(f_true_values)
-    false_vals_frac_fmt = _compact_0_1_fmt(f_false_values)
-
-    # Create an HTML string that combines fractions for the True and False values; this will be
-    # used in the Unique Vals column of the report table
-    true_false_vals_str = (
-        f"<span style='font-weight: bold;'>T</span>{true_vals_frac_fmt}<br>"
-        f"<span style='font-weight: bold;'>F</span>{false_vals_frac_fmt}"
-    )
-
-    # Create a single dictionary with the statistics for the column
-    stats_dict = {
-        "column_number": column_number,
-        "icon": SVG_ICONS_FOR_DATA_TYPES["boolean"],
-        "column_name": column_name_and_type,
-        "missing_vals": missing_vals_str,
-        "unique_vals": true_false_vals_str,
-        "mean": "—",
-        "std_dev": "—",
-        "min": "—",
-        "p05": "—",
-        "q_1": "—",
-        "med": "—",
-        "q_3": "—",
-        "p95": "—",
-        "max": "—",
-        "iqr": "—",
-    }
-
-    return stats_dict
-
-
-def _process_other_column_data(column_data: dict) -> dict:
-    column_number = column_data["column_number"]
-    column_name = column_data["column_name"]
-    column_type = column_data["column_type"]
-
-    column_name_and_type = (
-        f"<div style='font-size: 13px; white-space: nowrap; text-overflow: ellipsis; overflow: hidden;'>{column_name}</div>"
-        f"<div style='font-size: 11px; color: gray;'>{column_type}</div>"
-    )
-
-    # Get the Missing and Unique value counts and fractions
-    missing_vals = column_data["n_missing_values"]
-    unique_vals = column_data["n_unique_values"]
-    missing_vals_frac = _compact_decimal_fmt(column_data["f_missing_values"])
-    unique_vals_frac = _compact_decimal_fmt(column_data["f_unique_values"])
-
-    missing_vals_str = f"{missing_vals}<br>{missing_vals_frac}"
-    unique_vals_str = f"{unique_vals}<br>{unique_vals_frac}"
-
-    # Create a single dictionary with the statistics for the column
-    stats_dict = {
-        "column_number": column_number,
-        "icon": SVG_ICONS_FOR_DATA_TYPES["object"],
-        "column_name": column_name_and_type,
-        "missing_vals": missing_vals_str,
-        "unique_vals": unique_vals_str,
-        "mean": "—",
-        "std_dev": "—",
-        "min": "—",
-        "p05": "—",
-        "q_1": "—",
-        "med": "—",
-        "q_3": "—",
-        "p95": "—",
-        "max": "—",
-        "iqr": "—",
-    }
-
-    return stats_dict
```