pandas-plots 0.12.6__tar.gz → 0.12.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pandas_plots-0.12.6/src/pandas_plots.egg-info → pandas_plots-0.12.8}/PKG-INFO +1 -1
- {pandas_plots-0.12.6 → pandas_plots-0.12.8}/setup.cfg +1 -1
- {pandas_plots-0.12.6 → pandas_plots-0.12.8}/src/pandas_plots/pls.py +269 -118
- {pandas_plots-0.12.6 → pandas_plots-0.12.8}/src/pandas_plots/tbl.py +63 -23
- {pandas_plots-0.12.6 → pandas_plots-0.12.8/src/pandas_plots.egg-info}/PKG-INFO +1 -1
- {pandas_plots-0.12.6 → pandas_plots-0.12.8}/src/pandas_plots.egg-info/top_level.txt +1 -0
- {pandas_plots-0.12.6 → pandas_plots-0.12.8}/LICENSE +0 -0
- {pandas_plots-0.12.6 → pandas_plots-0.12.8}/README.md +0 -0
- {pandas_plots-0.12.6 → pandas_plots-0.12.8}/pyproject.toml +0 -0
- {pandas_plots-0.12.6 → pandas_plots-0.12.8}/src/pandas_plots/hlp.py +0 -0
- {pandas_plots-0.12.6 → pandas_plots-0.12.8}/src/pandas_plots/pii.py +0 -0
- {pandas_plots-0.12.6 → pandas_plots-0.12.8}/src/pandas_plots/ven.py +0 -0
- {pandas_plots-0.12.6 → pandas_plots-0.12.8}/src/pandas_plots.egg-info/SOURCES.txt +0 -0
- {pandas_plots-0.12.6 → pandas_plots-0.12.8}/src/pandas_plots.egg-info/dependency_links.txt +0 -0
- {pandas_plots-0.12.6 → pandas_plots-0.12.8}/src/pandas_plots.egg-info/pii.py +0 -0
- {pandas_plots-0.12.6 → pandas_plots-0.12.8}/src/pandas_plots.egg-info/requires.txt +0 -0
{pandas_plots-0.12.6 → pandas_plots-0.12.8}/src/pandas_plots/pls.py

@@ -1,7 +1,4 @@
 from pathlib import Path
-import warnings
-
-warnings.filterwarnings("ignore")
 
 import os
 from typing import Optional, Literal
@@ -12,50 +9,118 @@ from matplotlib import pyplot as plt
 from plotly import express as px
 import plotly.graph_objects as go
 from plotly.subplots import make_subplots
+import plotly  # needed for return types
 
 from .hlp import *
 from .tbl import print_summary
 
 ### helper functions
 
+
 def _set_caption(caption: str) -> str:
     return f"#️⃣{'-'.join(caption.split())}, " if caption else ""
 
 
-def aggregate_data(
+def aggregate_data(
+    df: pd.DataFrame,
+    top_n_index: int,
+    top_n_color: int,
+    top_n_facet: int,
+    null_label: str,
+    show_other: bool = False,
+    sort_values_index: bool = False,
+    sort_values_color: bool = False,
+    sort_values_facet: bool = False,
+) -> pd.DataFrame:
     """
     Aggregates the data, ensuring each combination of 'index', 'col', and 'facet' is unique with summed 'value'.
-
+
     Args:
         df (pd.DataFrame): Input DataFrame.
         top_n_index (int): top N values of the first column to keep. 0 means take all.
-
+        top_n_color (int): top N values of the second column to keep. 0 means take all.
         top_n_facet (int): top N values of the third column to keep. 0 means take all.
         null_label (str): Label for null values.
+        show_other (bool): Whether to include "<other>" for columns not in top_n_color. Defaults to False.
+        sort_values (bool): Whether to sort values in descending order based on group sum. Defaults to False.
 
     Returns:
         pd.DataFrame: Aggregated and filtered dataset.
     """
-
+
+    for col in ["index", "col", "facet"]:  # Skip 'value' column (numeric)
         df[col] = df[col].fillna(null_label)
 
     # Aggregate data to ensure unique combinations
-    aggregated_df = df.groupby([
+    aggregated_df = df.groupby(["index", "col", "facet"], as_index=False)["value"].sum()
+
+    # * Reduce data based on top_n parameters
+    if sort_values_index:
+        top_indexes = (
+            aggregated_df.groupby("index")["value"]
+            .sum()
+            .sort_values(ascending=False)[:top_n_index or None]
+            .index
+        )
+    else:
+        top_indexes = aggregated_df["index"].sort_values().unique()[:top_n_index or None]
+
+    aggregated_df = aggregated_df[aggregated_df["index"].isin(top_indexes)]
+
+    if sort_values_color:
+        top_colors = (
+            aggregated_df.groupby("col")["value"]
+            .sum()
+            .sort_values(ascending=False)[:top_n_color or None]
+            .index
+        )
+    else:
+        top_colors = aggregated_df["col"].sort_values().unique()[:top_n_color or None]
+
+    others_df = df[~df["col"].isin(top_colors)]
+    aggregated_df = aggregated_df[aggregated_df["col"].isin(top_colors)]
+    if show_other and top_n_color > 0 and not others_df.empty:
+        other_agg = others_df.groupby(["index", "facet"], as_index=False)[
+            "value"
+        ].sum()
+        other_agg["col"] = "<other>"
+        other_agg = other_agg[["index", "col", "facet", "value"]]
+        aggregated_df = pd.concat([aggregated_df, other_agg], ignore_index=True)
+        top_colors = [*top_colors, "<other>"]
+
+    if sort_values_facet:
+        top_facets = (
+            aggregated_df.groupby("facet")["value"]
+            .sum()
+            .sort_values(ascending=False)[:top_n_facet or None]
+            .index
+        )
+    else:
+        top_facets = aggregated_df["facet"].sort_values().unique()[:top_n_facet or None]
+
+    aggregated_df = aggregated_df[aggregated_df["facet"].isin(top_facets)]
+
+    # * Ensure facets are sorted alphabetically
+    aggregated_df["facet"] = pd.Categorical(
+        values=aggregated_df["facet"],
+        categories=top_facets,
+        ordered=True,
+    )
+
+    aggregated_df["index"] = pd.Categorical(
+        values=aggregated_df["index"],
+        categories=top_indexes,
+        ordered=True,
+    )
+
+    aggregated_df["col"] = pd.Categorical(
+        values=aggregated_df["col"],
+        categories=top_colors,
+        ordered=True,
+    )
 
-    # Reduce data based on top_n parameters
-    if top_n_index > 0:
-        top_indexes = aggregated_df.groupby('index')['value'].sum().nlargest(top_n_index).index
-        aggregated_df = aggregated_df[aggregated_df['index'].isin(top_indexes)]
-    if top_n_columns > 0:
-        top_columns = aggregated_df.groupby('col')['value'].sum().nlargest(top_n_columns).index
-        aggregated_df = aggregated_df[aggregated_df['col'].isin(top_columns)]
-    if top_n_facet > 0:
-        top_facets = aggregated_df.groupby('facet')['value'].sum().nlargest(top_n_facet).index
-        aggregated_df = aggregated_df[aggregated_df['facet'].isin(top_facets)]
 
-    #
-    aggregated_df['facet'] = pd.Categorical(aggregated_df['facet'], sorted(aggregated_df['facet'].unique()))
-    aggregated_df = aggregated_df.sort_values(by='facet')
+    # aggregated_df = aggregated_df.sort_values(by="facet")
 
     return aggregated_df
 
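
The reworked helper treats a top_n_* value of 0 as "keep everything": slicing with `[:top_n_index or None]` resolves to `[:None]` when the argument is 0, so no rows are dropped. Below is a minimal usage sketch against the long-format column names (index, col, facet, value) used above; the sample frame is made up for illustration and it is assumed the helper is reachable via the pandas_plots.pls module:

    import pandas as pd
    from pandas_plots import pls

    # toy long-format frame: one row per (index, col, facet) observation
    df = pd.DataFrame({
        "index": ["A", "A", "B", "B", "C", None],
        "col":   ["x", "y", "x", "z", "y", "x"],
        "facet": ["f1", "f1", "f2", "f2", "f1", "f2"],
        "value": [10, 3, 7, 2, 5, 1],
    })

    agg = pls.aggregate_data(
        df,
        top_n_index=2,        # keep the 2 index categories with the largest sums
        top_n_color=2,        # keep the 2 col categories with the largest sums
        top_n_facet=0,        # 0 -> keep all facets ([:0 or None] slices everything)
        null_label="<NA>",    # None is relabelled before grouping
        show_other=True,      # remaining col categories are collapsed into "<other>"
        sort_values_index=True,
        sort_values_color=True,
    )
    print(agg)
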
@@ -76,13 +141,15 @@ def assign_column_colors(columns, color_palette, null_label):
         palette = getattr(px.colors.qualitative, color_palette)
     else:
         raise ValueError(f"Invalid color palette: {color_palette}")
-
+
     colors = {col: palette[i % len(palette)] for i, col in enumerate(sorted(columns))}
     colors[null_label] = "lightgray"
     return colors
 
+
 ### main functions
 
+
 def plot_quadrants(
     df: pd.DataFrame,
     title: str = None,
@@ -162,7 +229,7 @@ def plot_quadrants(
 
     # * save to png if path is provided
     if png_path is not None:
-        plt.savefig(Path(png_path).as_posix(), format=
+        plt.savefig(Path(png_path).as_posix(), format="png")
 
     return q1, q2, q3, q4, n
     # * plotly express is not used for the heatmap, although it does not need the derived wide format.
@@ -184,12 +251,15 @@ def plot_stacked_bars(
     renderer: Literal["png", "svg", None] = "png",
     caption: str = None,
     sort_values: bool = False,
+    sort_values_index: bool = False,
+    sort_values_color: bool = False,
     show_total: bool = False,
     precision: int = 0,
     png_path: Path | str = None,
     color_palette: str = "Plotly",
     null_label: str = "<NA>",
-
+    show_other: bool = False,
+) -> plotly.graph_objects:
     """
     Generates a stacked bar plot using the provided DataFrame.
 
@@ -207,7 +277,7 @@ def plot_stacked_bars(
     - title (str): Custom title for the plot.
     - renderer (Literal["png", "svg", None]): Defines the output format.
     - caption (str): Optional caption for additional context.
-    - sort_values (bool):
+    - sort_values (bool):
         - If True, sorts bars by the sum of their values (descending).
         - If False, sorts bars alphabetically.
     - show_total (bool): If True, adds a row with the total sum of all categories.
@@ -215,7 +285,10 @@ def plot_stacked_bars(
     - png_path (Path | str): If specified, saves the plot as a PNG file.
     - color_palette (str): Name of the color palette to use.
     - null_label (str): Label for null values.
-
+    - show_other (bool): If True, shows the "Other" category in the legend.
+    - sort_values_index (bool): If True, sorts the index categories by group sum
+    - sort_values_color (bool): If True, sorts the columns categories by group sum
+
     Returns:
     - A Plotly figure object representing the stacked bar chart.
     """
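
Together, the new arguments control how bars are bucketed and ordered. A minimal usage sketch follows; the frame is illustrative, only parameters visible in this diff are set, and the remaining arguments are assumed to keep their defaults:

    import pandas as pd
    from pandas_plots import pls

    # two categorical columns plus a numeric column, as the function expects
    df = pd.DataFrame({
        "region":  ["north", "north", "south", "south", "west"],
        "product": ["a", "b", "a", "c", "b"],
        "revenue": [100, 40, 80, 15, 25],
    })

    fig = pls.plot_stacked_bars(
        df,
        top_n_index=2,           # keep the two largest regions
        top_n_color=2,           # keep the two largest products ...
        show_other=True,         # ... and stack everything else as "<other>"
        sort_values_index=True,  # order bars by their totals instead of alphabetically
        sort_values_color=True,  # order stack segments by their totals
        precision=0,
        null_label="<NA>",
    )
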
@@ -226,9 +299,19 @@ def plot_stacked_bars(
         print("❌ df must have exactly 2 or 3 columns")
         return
 
-    #
-
-
+    # ! do not enforce str columns anymore
+    # # * check if first 2 columns are str
+    # dtypes = set(df.iloc[:, [0, 1]].dtypes)
+    # dtypes_kind = [i.kind for i in dtypes]
+
+    # if set(dtypes_kind) - set(["O", "b"]):
+    #     print("❌ first 2 columns must be str")
+    #     # * overkill ^^
+    #     df.iloc[:, [0, 1]] = df.iloc[:, [0, 1]].astype(str)
+
+    # * but last col must be numeric
+    if df.iloc[:, -1].dtype.kind not in ("f", "i"):
+        print("❌ last column must be numeric")
         return
 
     df = df.copy()  # Copy the input DataFrame to avoid modifying the original
@@ -252,69 +335,103 @@ def plot_stacked_bars(
     # * apply precision
     df.iloc[:, 2] = df.iloc[:, 2].round(precision)
 
-    # * set index + color col
+    # # * set index + color col
     col_index = df.columns[0] if not swap else df.columns[1]
     col_color = df.columns[1] if not swap else df.columns[0]
 
     # * ensure df is grouped to prevent false aggregations
-    df = (
-        df.groupby([df.columns[0], df.columns[1]])
-        [df.columns[2]]
-        .sum()
-        .reset_index()
-    )
-
-    # * Sorting logic based on sort_values
-    if sort_values:
-        sort_order = (
-            df.groupby(col_index)[df.columns[2]].sum().sort_values(ascending=False).index
-        )
-    else:
-        sort_order = sorted(df[col_index].unique())  # Alphabetical order
-    df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
+    df = df.groupby([df.columns[0], df.columns[1]])[df.columns[2]].sum().reset_index()
 
     # * add total as aggregation of df
     if show_total:
-        df_total = df.
-
-
-
-
-        df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
-
-    if top_n_index > 0 and len(sort_order) > top_n_index:
-        top_categories = sort_order[:top_n_index]
-        df[col_index] = df[col_index].apply(lambda x: x if x in top_categories else "<other>")
-
-    unique_colors = sorted(df[col_color].unique())
-    if top_n_color > 0 and len(unique_colors) > top_n_color:
-        top_colors = unique_colors[:top_n_color]
-        df[col_color] = df[col_color].apply(lambda x: x if x in top_colors else "<other>")
-
-    column_colors = assign_column_colors(sorted(df[col_color].unique()), color_palette, null_label)
-
-    # # * assign colors to columns
-    # unique_colors = sorted(df[col_color].unique())
-    # column_colors = assign_column_colors(unique_colors, color_palette, null_label)
+        df_total = df.groupby(df.columns[1], observed=True, as_index=False)[
+            df.columns[2]
+        ].sum()
+        df_total[df.columns[0]] = " Total"
+        df = pd.concat([df, df_total], ignore_index=True)
 
     # * calculate n
     divider = 2 if show_total else 1
-    n = int(df
+    n = int(df.iloc[:, 2].sum() / divider)
 
     # * title str
     _title_str_top_index = f"TOP{top_n_index} " if top_n_index > 0 else ""
     _title_str_top_color = f"TOP{top_n_color} " if top_n_color > 0 else ""
     _title_str_null = f", NULL excluded" if dropna else ""
     _title_str_n = f", n={n:_}"
+
+    _df = df.copy().assign(facet=None)
+    _df.columns = (
+        ["index", "col", "value", "facet"]
+        if not swap
+        else ["col", "index", "value", "facet"]
+    )
+
+    aggregated_df = aggregate_data(
+        df=_df,
+        top_n_index=top_n_index,
+        top_n_color=top_n_color,
+        top_n_facet=0,
+        null_label=null_label,
+        show_other=show_other,
+        sort_values_index=sort_values_index,
+        sort_values_color=sort_values_color,
+        sort_values_facet=False,  # just a placeholder
+    )
+
+    df = aggregated_df.copy()
+
+    columns = sorted(
+        df.groupby("col", observed=True)["value"]
+        .sum()
+        .sort_values(ascending=False)
+        .index.tolist()
+    )
+    column_colors = assign_column_colors(columns, color_palette, null_label)
+
     caption = _set_caption(caption)
 
+    # * after grouping add cols for pct and formatting
+    df["cnt_pct_only"] = df["value"].apply(lambda x: f"{(x / n) * 100:.{precision}f}%")
+
+    # * format output
+    df["cnt_str"] = df["value"].apply(lambda x: f"{x:_.{precision}f}")
+
+    divider2 = "<br>" if orientation == "v" else " "
+    df["cnt_pct_str"] = df.apply(
+        lambda row: f"{row['cnt_str']}{divider2}({row['cnt_pct_only']})", axis=1
+    )
+
+    # # # * Sorting logic based on sort_values
+    # if sort_values_index:
+    #     sort_order = (
+    #         df.groupby("index")["value"].sum().sort_values(ascending=False).index
+    #     )
+    # else:
+    #     sort_order = sorted(df["index"].unique(), reverse=False)  # Alphabetical order
+
+    # display(sort_order)
+
+    # df["index"] = pd.Categorical(
+    #     values=df["index"],
+    #     # categories=sort_order,
+    #     ordered=True,
+    # )
+    df = (
+        df.sort_values(by="index", ascending=False)
+        if orientation == "h"
+        else df.sort_values(by="index", ascending=True)
+    )
+
+    # display(df)
+
     # * plot
     fig = px.bar(
         df,
-        x=
-        y=
-        color=
-        text=
+        x="index" if orientation == "v" else "value",
+        y="value" if orientation == "v" else "index",
+        color="col",
+        text="cnt_pct_str" if normalize else "cnt_str",
         orientation=orientation,
         title=title
         or f"{caption}{_title_str_top_index}[{col_index}] by {_title_str_top_color}[{col_color}]{_title_str_null}{_title_str_n}",
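
The new label columns rely on nested f-string formatting: `_` as thousands separator plus a runtime `precision`, with the percentage taken against the grand total `n`. A small standalone sketch of the same formatting (values are illustrative):

    # the same nested f-string formatting used for the bar labels above
    precision = 1
    n = 12_345.6            # grand total used as the percentage denominator
    value = 4_321.98

    cnt_str = f"{value:_.{precision}f}"                   # '4_322.0'
    cnt_pct_only = f"{(value / n) * 100:.{precision}f}%"  # '35.0%'
    print(f"{cnt_str}<br>({cnt_pct_only})")               # vertical-orientation label text
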
@@ -322,13 +439,15 @@ def plot_stacked_bars(
         width=width,
         height=height,
         color_discrete_map=column_colors,  # Use assigned colors
-        category_orders={
-
-
+        category_orders={
+            col_index: list(df["index"].cat.categories)
+        },  # <- Add this line
     )
-
+
+
+    # * get longest bar
     bar_max = (
-        df.groupby(
+        df.groupby("index")["value"].sum().sort_values(ascending=False).iloc[0]
         * BAR_LENGTH_MULTIPLIER
     )
     # * ignore if bar mode is on
@@ -352,7 +471,7 @@ def plot_stacked_bars(
         },
         },
     )
-
+
     # * set dtick
     if orientation == "h":
         if relative:
@@ -488,7 +607,10 @@ def plot_bars(
 
     # * after grouping add cols for pct and formatting
     df["pct"] = df[df.columns[1]] / n
+
+    # * format output
     df["cnt_str"] = df[df.columns[1]].apply(lambda x: f"{x:_.{precision}f}")
+
     divider = "<br>" if orientation == "v" else " "
     df["cnt_pct_str"] = df.apply(
         lambda row: f"{row['cnt_str']}{divider}({row['pct']:.1%})", axis=1
@@ -669,7 +791,7 @@ def plot_histogram(
         caption (str): The caption for the plot. Default is None.
         title (str): The title of the plot. Default is None.
         png_path (Path | str, optional): The path to save the image as a png file. Defaults to None.
-
+
 
     Returns:
         plot object
@@ -721,7 +843,7 @@ def plot_histogram(
     )
 
     fig.show(renderer)
-
+
     # * save to png if path is provided
     if png_path is not None:
         fig.write_image(Path(png_path).as_posix())
@@ -991,7 +1113,7 @@ def plot_boxes(
     points: Literal["all", "outliers", "suspectedoutliers", None] = None,
     precision: int = 2,
     height: int = 600,
-    width: int =
+    width: int = 1200,
     annotations: bool = True,
     summary: bool = True,
     title: str = None,
@@ -1018,7 +1140,7 @@ def plot_boxes(
     if (
         len(df.columns) != 2
         or not (
-            (pd.api.types.
+            (pd.api.types.is_object_dtype(df.iloc[:, 0]))
             or (pd.api.types.is_bool_dtype(df.iloc[:, 0]))
         )
         or not pd.api.types.is_numeric_dtype(df.iloc[:, 1])
@@ -1124,7 +1246,7 @@ def plot_boxes(
 
     fig.show("png")
     if summary:
-        print_summary(df)
+        print_summary(df=df, precision=precision)
 
     # * save to png if path is provided
    if png_path is not None:
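
With this change the textual summary printed for a box plot is rounded with the same `precision` as the plot itself. A minimal sketch of the call; the data is illustrative and the other arguments are assumed to keep their defaults:

    import pandas as pd
    from pandas_plots import pls

    df = pd.DataFrame({
        "group": ["a", "a", "b", "b", "b"],
        "score": [1.234, 2.345, 3.456, 4.567, 5.678],
    })

    # summary=True now forwards precision to print_summary, so the printed
    # min / median / mean / ... values are rounded to 2 digits here
    fig = pls.plot_boxes(df, precision=2, summary=True)
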
@@ -1133,12 +1255,11 @@ def plot_boxes(
     return fig
 
 
-
 def plot_facet_stacked_bars(
     df: pd.DataFrame,
     subplots_per_row: int = 4,
     top_n_index: int = 0,
-
+    top_n_color: int = 0,
     top_n_facet: int = 0,
     null_label: str = "<NA>",
     subplot_size: int = 300,
@@ -1148,6 +1269,12 @@ def plot_facet_stacked_bars(
     annotations: bool = False,
     precision: int = 0,
     png_path: Optional[Path] = None,
+    show_other: bool = False,
+    sort_values: bool = True,
+    sort_values_index: bool = False,
+    sort_values_color: bool = False,
+    sort_values_facet: bool = False,
+
 ) -> object:
     """
     Create a grid of stacked bar charts.
@@ -1156,7 +1283,7 @@ def plot_facet_stacked_bars(
         df (pd.DataFrame): DataFrame with 3 or 4 columns.
         subplots_per_row (int): Number of subplots per row.
         top_n_index (int): top N index values to keep.
-
+        top_n_color (int): top N column values to keep.
         top_n_facet (int): top N facet values to keep.
         null_label (str): Label for null values.
         subplot_size (int): Size of each subplot.
@@ -1166,47 +1293,57 @@ def plot_facet_stacked_bars(
         annotations (bool): Whether to show annotations in the subplots.
         precision (int): Decimal precision for annotations.
         png_path (Optional[Path]): Path to save the image.
+        show_other (bool): If True, adds an "<other>" bar for columns not in top_n_color.
+        sort_values_index (bool): If True, sorts index by group sum.
+        sort_values_color (bool): If True, sorts columns by group sum.
+        sort_values_facet (bool): If True, sorts facet by group sum.
+        sort_values (bool): DEPRECATED
+
 
     Returns:
         plot object
-
+
     Remarks:
         If you need to include facets that have no data, fill up like this beforehand:
         df.loc[len(df)]=[None, None, 12]
     """
-
+
     df = df.copy()  # Copy the input DataFrame to avoid modifying the original
 
     if not (df.shape[1] == 3 or df.shape[1] == 4):
         raise ValueError("Input DataFrame must have 3 or 4 columns.")
-
+
     original_column_names = df.columns.tolist()
+    original_rows = len(df)
 
     if df.shape[1] == 3:
-        df.columns = [
-        df[
+        df.columns = ["index", "col", "facet"]
+        df["value"] = 1
     elif df.shape[1] == 4:
-        df.columns = [
-
-    aggregated_df = aggregate_data(df, top_n_index, top_n_columns, top_n_facet, null_label)
-
-    # facets = aggregated_df['facet'].unique()
-    facets = sorted(aggregated_df['facet'].unique())  # Ensure facets are sorted consistently
+        df.columns = ["index", "col", "facet", "value"]
 
-
-
-
-
-
-
-
-
-
-
+    aggregated_df = aggregate_data(
+        df,
+        top_n_index,
+        top_n_color,
+        top_n_facet,
+        null_label,
+        show_other=show_other,
+        sort_values_index=sort_values_index,
+        sort_values_color=sort_values_color,
+        sort_values_facet=sort_values_facet,
+    )
 
+    facets = sorted(
+        aggregated_df["facet"].unique()
+    )  # Ensure facets are sorted consistently
 
-
-
+    columns = sorted(
+        aggregated_df.groupby("col", observed=True)["value"]
+        .sum()
+        .sort_values(ascending=False)
+        .index.tolist()
+    )
     column_colors = assign_column_colors(columns, color_palette, null_label)
 
     fig = make_subplots(
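
Since the faceted variant now funnels through the same aggregate_data helper, the new keyword arguments behave the same way as in plot_stacked_bars. A minimal sketch with a 3-column frame (each row counts as 1; a 4th numeric column would be summed instead); the data is illustrative and the remaining arguments are assumed to keep their defaults:

    import pandas as pd
    from pandas_plots import pls

    df = pd.DataFrame({
        "country": ["DE", "DE", "FR", "FR", "IT"],
        "product": ["a", "b", "a", "c", "b"],
        "year":    ["2023", "2024", "2023", "2024", "2024"],
    })

    fig = pls.plot_facet_stacked_bars(
        df,
        subplots_per_row=2,
        top_n_color=2,            # keep the two most frequent products
        show_other=True,          # bucket the remaining products into "<other>"
        sort_values_color=True,   # order stack segments by their summed value
        sort_values_facet=False,  # keep facets (years) in alphabetical order
    )
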
@@ -1215,25 +1352,39 @@ def plot_facet_stacked_bars(
         subplot_titles=facets,
     )
 
+    # * Ensure all categories appear in the legend by adding an invisible trace
+    for column in columns:
+        fig.add_trace(
+            go.Bar(
+                x=[None],  # Invisible bar
+                y=[None],
+                name=column,
+                marker=dict(color=column_colors[column]),
+                showlegend=True,  # Ensure it appears in the legend
+            )
+        )
+
     added_to_legend = set()
     for i, facet in enumerate(facets):
-        facet_data = aggregated_df[aggregated_df[
+        facet_data = aggregated_df[aggregated_df["facet"] == facet]
         row = (i // subplots_per_row) + 1
         col = (i % subplots_per_row) + 1
 
         for column in columns:
-            column_data = facet_data[facet_data[
+            column_data = facet_data[facet_data["col"] == column]
+
             show_legend = column not in added_to_legend
             if show_legend:
                 added_to_legend.add(column)
 
             fig.add_trace(
                 go.Bar(
-                    x=column_data[
-                    y=column_data[
+                    x=column_data["index"],
+                    y=column_data["value"],
                     name=column,
                     marker=dict(color=column_colors[column]),
-
+                    legendgroup=column,  # Ensures multiple traces use the same legend entry
+                    showlegend=False,  # suppress further legend items
                 ),
                 row=row,
                 col=col,
@@ -1242,8 +1393,8 @@ def plot_facet_stacked_bars(
             if annotations:
                 for _, row_data in column_data.iterrows():
                     fig.add_annotation(
-                        x=row_data[
-                        y=row_data[
+                        x=row_data["index"],
+                        y=row_data["value"],
                         text=f"{row_data['value']:.{precision}f}",
                         showarrow=False,
                         row=row,
@@ -1257,8 +1408,8 @@ def plot_facet_stacked_bars(
     else:
         axis_details.append(f"[{original_column_names[0]}]")
 
-    if
-        axis_details.append(f"TOP {
+    if top_n_color > 0:
+        axis_details.append(f"TOP {top_n_color} [{original_column_names[1]}]")
     else:
         axis_details.append(f"[{original_column_names[1]}]")
 
@@ -1267,7 +1418,7 @@ def plot_facet_stacked_bars(
     else:
         axis_details.append(f"[{original_column_names[2]}]")
 
-    title = f"{caption} {', '.join(axis_details)}, n = {
+    title = f"{caption} {', '.join(axis_details)}, n = {original_rows:_}"
     template = "plotly_dark" if os.getenv("THEME") == "dark" else "plotly"
     fig.update_layout(
         title=title,
{pandas_plots-0.12.6 → pandas_plots-0.12.8}/src/pandas_plots/tbl.py

@@ -15,7 +15,7 @@ from plotly.subplots import make_subplots
 from scipy import stats
 import dataframe_image as dfi
 
-from .hlp import wrap_text
+from .hlp import wrap_text, to_series
 
 import duckdb as ddb
 
@@ -696,7 +696,7 @@ def show_num_df(
 
 
 
-def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
+def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str=" ", precision: int=3):
     """
     Print statistical summary for a pandas DataFrame or Series.
 
@@ -712,15 +712,44 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
         df (Union[pd.DataFrame, pd.Series]): Input DataFrame or Series. Only numeric columns
             in DataFrame are considered.
         show (bool, optional): Whether to print the summary. Defaults to True.
-        name (str, optional): Prefix for the summary. Defaults to "
+        name (str, optional): Prefix for the summary. Defaults to " ".
+        precision (int, optional): Number of digits to round the results to. Defaults to 3.
     """
     if df.empty:
         return
 
     # * drop NA to keep scipy sane
-    df = df.dropna().copy()
+    df = df.dropna().copy()
 
-
+    # display(df)
+
+    if len(df.columns) == 1:
+        df = df.to_series()
+
+    pd.api.types.is_numeric_dtype(df)
+
+
+    if not (
+        # * series must be numeric
+        (isinstance(df, pd.Series)
+        and pd.api.types.is_numeric_dtype(df)
+        )
+        or
+        # * df must have 2 columns str num
+        (len(df.columns) == 2
+        and (
+            (pd.api.types.is_object_dtype(df.iloc[:, 0]))
+            or (pd.api.types.is_bool_dtype(df.iloc[:, 0]))
+        )
+        and pd.api.types.is_numeric_dtype(df.iloc[:, 1])
+        )
+    ):
+        print(f"❌ df must have 2 columns: [0] str or bool, [1] num, or be a series")
+        return
+
+
+
+    def print_summary_ser(ser: pd.Series, show: bool=True, name: str="", precision: int=3):
         # Calculate IQR and pass `rng=(25, 75)` to get the interquartile range
         iqr_value = stats.iqr(ser)
 
@@ -728,21 +757,21 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
         # ser.dropna(inplace=True)
 
         # Using the iqr function, we still calculate the bounds manually
-        q1 = stats.scoreatpercentile(ser, 25)
-        q3 = stats.scoreatpercentile(ser, 75)
+        q1 = round(stats.scoreatpercentile(ser, 25), precision)
+        q3 = round(stats.scoreatpercentile(ser, 75), precision)
 
         # Calculate upper bound directly
-        min = round(ser.min(),
-        med = round(ser.median(),
-        upper = round(q3 + 1.5 * iqr_value,
-        lower = round(q1 - 1.5 * iqr_value,
-        mean = round(ser.mean(),
-        std = round(ser.std(),
-        cv = round(ser.std() / ser.mean(),
-        max = round(ser.max(),
-        sum = round(ser.sum(),
-        skew = round(stats.skew(ser.dropna().tolist()),
-        kurto = round(stats.kurtosis(ser.dropna().tolist()),
+        min = round(ser.min(), precision)
+        med = round(ser.median(), precision)
+        upper = round(q3 + 1.5 * iqr_value, precision)
+        lower = round(q1 - 1.5 * iqr_value, precision)
+        mean = round(ser.mean(), precision)
+        std = round(ser.std(), precision)
+        cv = round(ser.std() / ser.mean(), precision)
+        max = round(ser.max(), precision)
+        sum = round(ser.sum(), precision)
+        skew = round(stats.skew(ser.dropna().tolist()), precision)
+        kurto = round(stats.kurtosis(ser.dropna().tolist()), precision)
 
         lower = min if lower < min else lower
         upper = max if upper > max else upper
@@ -750,7 +779,7 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
         # * extra care for scipy metrics, these are very vulnarable to nan
         if show:
             print(
-                f"""{name} min: {min:_} | lower: {lower:_} | q25: {q1:_} | median: {med:_} | mean: {mean:_} | q75: {q3:_} | upper: {upper:_} | max: {max:_} | std: {std:_} | cv: {cv:_} | sum: {sum:_} | skew: {skew} | kurto: {kurto}""")
+                f"""{name} -> min: {min:_} | lower: {lower:_} | q25: {q1:_} | median: {med:_} | mean: {mean:_} | q75: {q3:_} | upper: {upper:_} | max: {max:_} | std: {std:_} | cv: {cv:_} | sum: {sum:_} | skew: {skew} | kurto: {kurto}""")
 
         summary = {
             "min": min,
@@ -770,11 +799,22 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
         return summary
 
     if isinstance(df, pd.Series):
-
+        # * print serie
+        name = df.name if df.name else "series"
+        print_summary_ser(ser=df, show=show, name=name, precision=precision)
+        return
 
     if isinstance(df, pd.DataFrame):
-        # *
-
-
+        # * print for all values
+        print(f"🟧 all data")
+        name = df.columns[-1]
+        summary = print_summary_ser(ser=df.iloc[:,1], show=show, name=name, precision=precision)
+
+        print(f"🟧 boxes")
+        # * print for each value
+        for item in df.iloc[:,0].unique():
+            # display(df[df.iloc[:,0] == item])
+            print_summary_ser(ser=df[df.iloc[:,0] == item].iloc[:,1], show=show, name=item, precision=precision)
 
     return summary
+
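
With the new dispatch, print_summary accepts a two-column frame (categorical + numeric) and prints one summary line for the whole numeric column plus one per category, all rounded to `precision` digits. A minimal usage sketch (illustrative data):

    import pandas as pd
    from pandas_plots.tbl import print_summary

    df = pd.DataFrame({
        "group": ["a", "a", "b", "b", "b"],
        "value": [1.2345, 2.3456, 3.4567, 4.5678, 5.6789],
    })

    # prints a summary for the whole "value" column under "🟧 all data",
    # then one line per group under "🟧 boxes", rounded to 2 digits
    print_summary(df, precision=2)
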