pandas-plots 0.12.6__py3-none-any.whl → 0.12.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pandas_plots/pls.py CHANGED
@@ -1,7 +1,4 @@
  from pathlib import Path
- import warnings
-
- warnings.filterwarnings("ignore")

  import os
  from typing import Optional, Literal
@@ -12,50 +9,118 @@ from matplotlib import pyplot as plt
  from plotly import express as px
  import plotly.graph_objects as go
  from plotly.subplots import make_subplots
+ import plotly # needed for return types

  from .hlp import *
  from .tbl import print_summary

  ### helper functions

+
  def _set_caption(caption: str) -> str:
  return f"#️⃣{'-'.join(caption.split())}, " if caption else ""


- def aggregate_data(df: pd.DataFrame, top_n_index: int, top_n_columns: int, top_n_facet: int, null_label: str) -> pd.DataFrame:
+ def aggregate_data(
+ df: pd.DataFrame,
+ top_n_index: int,
+ top_n_color: int,
+ top_n_facet: int,
+ null_label: str,
+ show_other: bool = False,
+ sort_values_index: bool = False,
+ sort_values_color: bool = False,
+ sort_values_facet: bool = False,
+ ) -> pd.DataFrame:
  """
  Aggregates the data, ensuring each combination of 'index', 'col', and 'facet' is unique with summed 'value'.
-
+
  Args:
  df (pd.DataFrame): Input DataFrame.
  top_n_index (int): top N values of the first column to keep. 0 means take all.
- top_n_columns (int): top N values of the second column to keep. 0 means take all.
+ top_n_color (int): top N values of the second column to keep. 0 means take all.
  top_n_facet (int): top N values of the third column to keep. 0 means take all.
  null_label (str): Label for null values.
+ show_other (bool): Whether to include "<other>" for columns not in top_n_color. Defaults to False.
+ sort_values (bool): Whether to sort values in descending order based on group sum. Defaults to False.

  Returns:
  pd.DataFrame: Aggregated and filtered dataset.
  """
- for col in ['index', 'col', 'facet']: # Skip 'value' column (numeric)
+
+ for col in ["index", "col", "facet"]: # Skip 'value' column (numeric)
  df[col] = df[col].fillna(null_label)

  # Aggregate data to ensure unique combinations
- aggregated_df = df.groupby(['index', 'col', 'facet'], as_index=False)['value'].sum()
+ aggregated_df = df.groupby(["index", "col", "facet"], as_index=False)["value"].sum()
+
+ # * Reduce data based on top_n parameters
+ if sort_values_index:
+ top_indexes = (
+ aggregated_df.groupby("index")["value"]
+ .sum()
+ .sort_values(ascending=False)[:top_n_index or None]
+ .index
+ )
+ else:
+ top_indexes = aggregated_df["index"].sort_values().unique()[:top_n_index or None]
+
+ aggregated_df = aggregated_df[aggregated_df["index"].isin(top_indexes)]
+
+ if sort_values_color:
+ top_colors = (
+ aggregated_df.groupby("col")["value"]
+ .sum()
+ .sort_values(ascending=False)[:top_n_color or None]
+ .index
+ )
+ else:
+ top_colors = aggregated_df["col"].sort_values().unique()[:top_n_color or None]
+
+ others_df = df[~df["col"].isin(top_colors)]
+ aggregated_df = aggregated_df[aggregated_df["col"].isin(top_colors)]
+ if show_other and top_n_color > 0 and not others_df.empty:
+ other_agg = others_df.groupby(["index", "facet"], as_index=False)[
+ "value"
+ ].sum()
+ other_agg["col"] = "<other>"
+ other_agg = other_agg[["index", "col", "facet", "value"]]
+ aggregated_df = pd.concat([aggregated_df, other_agg], ignore_index=True)
+ top_colors = [*top_colors, "<other>"]
+
+ if sort_values_facet:
+ top_facets = (
+ aggregated_df.groupby("facet")["value"]
+ .sum()
+ .sort_values(ascending=False)[:top_n_facet or None]
+ .index
+ )
+ else:
+ top_facets = aggregated_df["facet"].sort_values().unique()[:top_n_facet or None]
+
+ aggregated_df = aggregated_df[aggregated_df["facet"].isin(top_facets)]
+
+ # * Ensure facets are sorted alphabetically
+ aggregated_df["facet"] = pd.Categorical(
+ values=aggregated_df["facet"],
+ categories=top_facets,
+ ordered=True,
+ )
+
+ aggregated_df["index"] = pd.Categorical(
+ values=aggregated_df["index"],
+ categories=top_indexes,
+ ordered=True,
+ )
+
+ aggregated_df["col"] = pd.Categorical(
+ values=aggregated_df["col"],
+ categories=top_colors,
+ ordered=True,
+ )

- # Reduce data based on top_n parameters
- if top_n_index > 0:
- top_indexes = aggregated_df.groupby('index')['value'].sum().nlargest(top_n_index).index
- aggregated_df = aggregated_df[aggregated_df['index'].isin(top_indexes)]
- if top_n_columns > 0:
- top_columns = aggregated_df.groupby('col')['value'].sum().nlargest(top_n_columns).index
- aggregated_df = aggregated_df[aggregated_df['col'].isin(top_columns)]
- if top_n_facet > 0:
- top_facets = aggregated_df.groupby('facet')['value'].sum().nlargest(top_n_facet).index
- aggregated_df = aggregated_df[aggregated_df['facet'].isin(top_facets)]

- # Ensure facets are sorted alphabetically
- aggregated_df['facet'] = pd.Categorical(aggregated_df['facet'], sorted(aggregated_df['facet'].unique()))
- aggregated_df = aggregated_df.sort_values(by='facet')
+ # aggregated_df = aggregated_df.sort_values(by="facet")

  return aggregated_df

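For illustration only (not from the package source): a minimal sketch of how the reworked aggregate_data could be driven on its own, assuming the helper is importable from pandas_plots.pls; the long-format frame and its values are invented, and the column names follow the index/col/facet/value contract the function expects.

import pandas as pd
from pandas_plots.pls import aggregate_data  # assumption: module-level helper is importable

# toy long-format data with the expected column names (illustrative only)
df = pd.DataFrame(
    {
        "index": ["A", "A", "B", "B", None],  # None is replaced by null_label
        "col": ["x", "y", "x", "y", "x"],
        "facet": ["f1", "f1", "f2", "f2", "f2"],
        "value": [10, 5, 3, 7, 2],
    }
)

# keep the single strongest color by group sum, bucket the rest into "<other>"
out = aggregate_data(
    df,
    top_n_index=0,
    top_n_color=1,
    top_n_facet=0,
    null_label="<NA>",
    show_other=True,
    sort_values_color=True,
)
print(out)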
@@ -76,13 +141,15 @@ def assign_column_colors(columns, color_palette, null_label):
  palette = getattr(px.colors.qualitative, color_palette)
  else:
  raise ValueError(f"Invalid color palette: {color_palette}")
-
+
  colors = {col: palette[i % len(palette)] for i, col in enumerate(sorted(columns))}
  colors[null_label] = "lightgray"
  return colors

+
  ### main functions

+
  def plot_quadrants(
  df: pd.DataFrame,
  title: str = None,
@@ -162,7 +229,7 @@ def plot_quadrants(

  # * save to png if path is provided
  if png_path is not None:
- plt.savefig(Path(png_path).as_posix(), format='png')
+ plt.savefig(Path(png_path).as_posix(), format="png")

  return q1, q2, q3, q4, n
  # * plotly express is not used for the heatmap, although it does not need the derived wide format.
@@ -184,12 +251,15 @@ def plot_stacked_bars(
  renderer: Literal["png", "svg", None] = "png",
  caption: str = None,
  sort_values: bool = False,
+ sort_values_index: bool = False,
+ sort_values_color: bool = False,
  show_total: bool = False,
  precision: int = 0,
  png_path: Path | str = None,
  color_palette: str = "Plotly",
  null_label: str = "<NA>",
- ) -> object:
+ show_other: bool = False,
+ ) -> plotly.graph_objects:
  """
  Generates a stacked bar plot using the provided DataFrame.

@@ -207,7 +277,7 @@ def plot_stacked_bars(
  - title (str): Custom title for the plot.
  - renderer (Literal["png", "svg", None]): Defines the output format.
  - caption (str): Optional caption for additional context.
- - sort_values (bool):
+ - sort_values (bool):
  - If True, sorts bars by the sum of their values (descending).
  - If False, sorts bars alphabetically.
  - show_total (bool): If True, adds a row with the total sum of all categories.
@@ -215,7 +285,10 @@ def plot_stacked_bars(
  - png_path (Path | str): If specified, saves the plot as a PNG file.
  - color_palette (str): Name of the color palette to use.
  - null_label (str): Label for null values.
-
+ - show_other (bool): If True, shows the "Other" category in the legend.
+ - sort_values_index (bool): If True, sorts the index categories by group sum
+ - sort_values_color (bool): If True, sorts the columns categories by group sum
+
  Returns:
  - A Plotly figure object representing the stacked bar chart.
  """
@@ -226,9 +299,19 @@ def plot_stacked_bars(
  print("❌ df must have exactly 2 or 3 columns")
  return

- # * check if first 2 columns are str
- if list(set((df.iloc[:, [0, 1]].dtypes)))[0].kind not in ["O", "b"]:
- print("❌ first 2 columns must be str")
+ # ! do not enforce str columns anymore
+ # # * check if first 2 columns are str
+ # dtypes = set(df.iloc[:, [0, 1]].dtypes)
+ # dtypes_kind = [i.kind for i in dtypes]
+
+ # if set(dtypes_kind) - set(["O", "b"]):
+ # print("❌ first 2 columns must be str")
+ # # * overkill ^^
+ # df.iloc[:, [0, 1]] = df.iloc[:, [0, 1]].astype(str)
+
+ # * but last col must be numeric
+ if df.iloc[:, -1].dtype.kind not in ("f", "i"):
+ print("❌ last column must be numeric")
  return

  df = df.copy() # Copy the input DataFrame to avoid modifying the original
@@ -252,69 +335,103 @@ def plot_stacked_bars(
  # * apply precision
  df.iloc[:, 2] = df.iloc[:, 2].round(precision)

- # * set index + color col
+ # # * set index + color col
  col_index = df.columns[0] if not swap else df.columns[1]
  col_color = df.columns[1] if not swap else df.columns[0]

  # * ensure df is grouped to prevent false aggregations
- df = (
- df.groupby([df.columns[0], df.columns[1]])
- [df.columns[2]]
- .sum()
- .reset_index()
- )
-
- # * Sorting logic based on sort_values
- if sort_values:
- sort_order = (
- df.groupby(col_index)[df.columns[2]].sum().sort_values(ascending=False).index
- )
- else:
- sort_order = sorted(df[col_index].unique()) # Alphabetical order
- df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
+ df = df.groupby([df.columns[0], df.columns[1]])[df.columns[2]].sum().reset_index()

  # * add total as aggregation of df
  if show_total:
- df_total = df.copy()
- df_total[col_index] = " TOTAL" # add space to make this item first
- df = pd.concat([df, df_total])
-
- # * Convert to categorical with explicit ordering
- df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
-
- if top_n_index > 0 and len(sort_order) > top_n_index:
- top_categories = sort_order[:top_n_index]
- df[col_index] = df[col_index].apply(lambda x: x if x in top_categories else "<other>")
-
- unique_colors = sorted(df[col_color].unique())
- if top_n_color > 0 and len(unique_colors) > top_n_color:
- top_colors = unique_colors[:top_n_color]
- df[col_color] = df[col_color].apply(lambda x: x if x in top_colors else "<other>")
-
- column_colors = assign_column_colors(sorted(df[col_color].unique()), color_palette, null_label)
-
- # # * assign colors to columns
- # unique_colors = sorted(df[col_color].unique())
- # column_colors = assign_column_colors(unique_colors, color_palette, null_label)
+ df_total = df.groupby(df.columns[1], observed=True, as_index=False)[
+ df.columns[2]
+ ].sum()
+ df_total[df.columns[0]] = " Total"
+ df = pd.concat([df, df_total], ignore_index=True)

  # * calculate n
  divider = 2 if show_total else 1
- n = int(df[df.columns[2]].sum() / divider)
+ n = int(df.iloc[:, 2].sum() / divider)

  # * title str
  _title_str_top_index = f"TOP{top_n_index} " if top_n_index > 0 else ""
  _title_str_top_color = f"TOP{top_n_color} " if top_n_color > 0 else ""
  _title_str_null = f", NULL excluded" if dropna else ""
  _title_str_n = f", n={n:_}"
+
+ _df = df.copy().assign(facet=None)
+ _df.columns = (
+ ["index", "col", "value", "facet"]
+ if not swap
+ else ["col", "index", "value", "facet"]
+ )
+
+ aggregated_df = aggregate_data(
+ df=_df,
+ top_n_index=top_n_index,
+ top_n_color=top_n_color,
+ top_n_facet=0,
+ null_label=null_label,
+ show_other=show_other,
+ sort_values_index=sort_values_index,
+ sort_values_color=sort_values_color,
+ sort_values_facet=False, # just a placeholder
+ )
+
+ df = aggregated_df.copy()
+
+ columns = sorted(
+ df.groupby("col", observed=True)["value"]
+ .sum()
+ .sort_values(ascending=False)
+ .index.tolist()
+ )
+ column_colors = assign_column_colors(columns, color_palette, null_label)
+
  caption = _set_caption(caption)

+ # * after grouping add cols for pct and formatting
+ df["cnt_pct_only"] = df["value"].apply(lambda x: f"{(x / n) * 100:.{precision}f}%")
+
+ # * format output
+ df["cnt_str"] = df["value"].apply(lambda x: f"{x:_.{precision}f}")
+
+ divider2 = "<br>" if orientation == "v" else " "
+ df["cnt_pct_str"] = df.apply(
+ lambda row: f"{row['cnt_str']}{divider2}({row['cnt_pct_only']})", axis=1
+ )
+
+ # # # * Sorting logic based on sort_values
+ # if sort_values_index:
+ # sort_order = (
+ # df.groupby("index")["value"].sum().sort_values(ascending=False).index
+ # )
+ # else:
+ # sort_order = sorted(df["index"].unique(), reverse=False) # Alphabetical order
+
+ # display(sort_order)
+
+ # df["index"] = pd.Categorical(
+ # values=df["index"],
+ # # categories=sort_order,
+ # ordered=True,
+ # )
+ df = (
+ df.sort_values(by="index", ascending=False)
+ if orientation == "h"
+ else df.sort_values(by="index", ascending=True)
+ )
+
+ # display(df)
+
  # * plot
  fig = px.bar(
  df,
- x=col_index if orientation == "v" else df.columns[2],
- y=df.columns[2] if orientation == "v" else col_index,
- color=col_color,
- text=df.columns[2],
+ x="index" if orientation == "v" else "value",
+ y="value" if orientation == "v" else "index",
+ color="col",
+ text="cnt_pct_str" if normalize else "cnt_str",
  orientation=orientation,
  title=title
  or f"{caption}{_title_str_top_index}[{col_index}] by {_title_str_top_color}[{col_color}]{_title_str_null}{_title_str_n}",
@@ -322,13 +439,15 @@ def plot_stacked_bars(
  width=width,
  height=height,
  color_discrete_map=column_colors, # Use assigned colors
- category_orders={col_index: list(df[col_index].cat.categories)}, # <- Add this line
- # category_orders={col_index: df[col_index].categories.tolist() if isinstance(df[col_index].dtype, pd.CategoricalDtype) else sorted(df[col_index].unique())}
-
+ category_orders={
+ col_index: list(df["index"].cat.categories)
+ }, # <- Add this line
  )
- # * get longest bar
+
+
+ # * get longest bar
  bar_max = (
- df.groupby(col_index)[df.columns[2]].sum().sort_values(ascending=False).iloc[0]
+ df.groupby("index")["value"].sum().sort_values(ascending=False).iloc[0]
  * BAR_LENGTH_MULTIPLIER
  )
  # * ignore if bar mode is on
@@ -352,7 +471,7 @@ def plot_stacked_bars(
  },
  },
  )
-
+
  # * set dtick
  if orientation == "h":
  if relative:
@@ -488,7 +607,10 @@ def plot_bars(

  # * after grouping add cols for pct and formatting
  df["pct"] = df[df.columns[1]] / n
+
+ # * format output
  df["cnt_str"] = df[df.columns[1]].apply(lambda x: f"{x:_.{precision}f}")
+
  divider = "<br>" if orientation == "v" else " "
  df["cnt_pct_str"] = df.apply(
  lambda row: f"{row['cnt_str']}{divider}({row['pct']:.1%})", axis=1
@@ -669,7 +791,7 @@ def plot_histogram(
  caption (str): The caption for the plot. Default is None.
  title (str): The title of the plot. Default is None.
  png_path (Path | str, optional): The path to save the image as a png file. Defaults to None.
-
+

  Returns:
  plot object
@@ -721,7 +843,7 @@ def plot_histogram(
  )

  fig.show(renderer)
-
+
  # * save to png if path is provided
  if png_path is not None:
  fig.write_image(Path(png_path).as_posix())
@@ -991,7 +1113,7 @@ def plot_boxes(
  points: Literal["all", "outliers", "suspectedoutliers", None] = None,
  precision: int = 2,
  height: int = 600,
- width: int = 800,
+ width: int = 1200,
  annotations: bool = True,
  summary: bool = True,
  title: str = None,
@@ -1018,7 +1140,7 @@ def plot_boxes(
  if (
  len(df.columns) != 2
  or not (
- (pd.api.types.is_string_dtype(df.iloc[:, 0]))
+ (pd.api.types.is_object_dtype(df.iloc[:, 0]))
  or (pd.api.types.is_bool_dtype(df.iloc[:, 0]))
  )
  or not pd.api.types.is_numeric_dtype(df.iloc[:, 1])
@@ -1124,7 +1246,7 @@ def plot_boxes(

  fig.show("png")
  if summary:
- print_summary(df)
+ print_summary(df=df, precision=precision)

  # * save to png if path is provided
  if png_path is not None:
@@ -1133,12 +1255,11 @@ def plot_boxes(
  return fig


-
  def plot_facet_stacked_bars(
  df: pd.DataFrame,
  subplots_per_row: int = 4,
  top_n_index: int = 0,
- top_n_columns: int = 0,
+ top_n_color: int = 0,
  top_n_facet: int = 0,
  null_label: str = "<NA>",
  subplot_size: int = 300,
@@ -1148,6 +1269,12 @@ def plot_facet_stacked_bars(
  annotations: bool = False,
  precision: int = 0,
  png_path: Optional[Path] = None,
+ show_other: bool = False,
+ sort_values: bool = True,
+ sort_values_index: bool = False,
+ sort_values_color: bool = False,
+ sort_values_facet: bool = False,
+
  ) -> object:
  """
  Create a grid of stacked bar charts.
@@ -1156,7 +1283,7 @@ def plot_facet_stacked_bars(
  df (pd.DataFrame): DataFrame with 3 or 4 columns.
  subplots_per_row (int): Number of subplots per row.
  top_n_index (int): top N index values to keep.
- top_n_columns (int): top N column values to keep.
+ top_n_color (int): top N column values to keep.
  top_n_facet (int): top N facet values to keep.
  null_label (str): Label for null values.
  subplot_size (int): Size of each subplot.
@@ -1166,47 +1293,57 @@ def plot_facet_stacked_bars(
  annotations (bool): Whether to show annotations in the subplots.
  precision (int): Decimal precision for annotations.
  png_path (Optional[Path]): Path to save the image.
+ show_other (bool): If True, adds an "<other>" bar for columns not in top_n_color.
+ sort_values_index (bool): If True, sorts index by group sum.
+ sort_values_color (bool): If True, sorts columns by group sum.
+ sort_values_facet (bool): If True, sorts facet by group sum.
+ sort_values (bool): DEPRECATED
+

  Returns:
  plot object
-
+
  Remarks:
  If you need to include facets that have no data, fill up like this beforehand:
  df.loc[len(df)]=[None, None, 12]
  """
-
+
  df = df.copy() # Copy the input DataFrame to avoid modifying the original

  if not (df.shape[1] == 3 or df.shape[1] == 4):
  raise ValueError("Input DataFrame must have 3 or 4 columns.")
-
+
  original_column_names = df.columns.tolist()
+ original_rows = len(df)

  if df.shape[1] == 3:
- df.columns = ['index', 'col', 'facet']
- df['value'] = 1
+ df.columns = ["index", "col", "facet"]
+ df["value"] = 1
  elif df.shape[1] == 4:
- df.columns = ['index', 'col', 'facet', 'value']
-
- aggregated_df = aggregate_data(df, top_n_index, top_n_columns, top_n_facet, null_label)
-
- # facets = aggregated_df['facet'].unique()
- facets = sorted(aggregated_df['facet'].unique()) # Ensure facets are sorted consistently
+ df.columns = ["index", "col", "facet", "value"]

- if top_n_columns > 0:
- top_columns = aggregated_df.groupby('col', observed=True)['value'].sum().nlargest(top_n_columns).index.tolist()
- # aggregated_df['col'] = aggregated_df['col'].apply(lambda x: x if x in top_columns else "<other>")
- # aggregated_df['col'] = pd.Categorical(aggregated_df['col'], categories=top_columns + ["<other>"], ordered=True)
- # aggregated_df['col'] = pd.Categorical(
- # aggregated_df['col'].map(lambda x: x if x in top_columns else "<other>"),
- # categories=top_columns + ["<other>"],
- # ordered=True
- # )
- aggregated_df['col'] = aggregated_df['col'].apply(lambda x: x if x in top_columns else "<other>")
+ aggregated_df = aggregate_data(
+ df,
+ top_n_index,
+ top_n_color,
+ top_n_facet,
+ null_label,
+ show_other=show_other,
+ sort_values_index=sort_values_index,
+ sort_values_color=sort_values_color,
+ sort_values_facet=sort_values_facet,
+ )

+ facets = sorted(
+ aggregated_df["facet"].unique()
+ ) # Ensure facets are sorted consistently

- # columns = sorted(aggregated_df['col'].unique())
- columns = aggregated_df.groupby('col', observed=True)['value'].sum().sort_values(ascending=False).index.tolist()
+ columns = sorted(
+ aggregated_df.groupby("col", observed=True)["value"]
+ .sum()
+ .sort_values(ascending=False)
+ .index.tolist()
+ )
  column_colors = assign_column_colors(columns, color_palette, null_label)

  fig = make_subplots(
@@ -1215,25 +1352,39 @@ def plot_facet_stacked_bars(
  subplot_titles=facets,
  )

+ # * Ensure all categories appear in the legend by adding an invisible trace
+ for column in columns:
+ fig.add_trace(
+ go.Bar(
+ x=[None], # Invisible bar
+ y=[None],
+ name=column,
+ marker=dict(color=column_colors[column]),
+ showlegend=True, # Ensure it appears in the legend
+ )
+ )
+
  added_to_legend = set()
  for i, facet in enumerate(facets):
- facet_data = aggregated_df[aggregated_df['facet'] == facet]
+ facet_data = aggregated_df[aggregated_df["facet"] == facet]
  row = (i // subplots_per_row) + 1
  col = (i % subplots_per_row) + 1

  for column in columns:
- column_data = facet_data[facet_data['col'] == column]
+ column_data = facet_data[facet_data["col"] == column]
+
  show_legend = column not in added_to_legend
  if show_legend:
  added_to_legend.add(column)

  fig.add_trace(
  go.Bar(
- x=column_data['index'],
- y=column_data['value'],
+ x=column_data["index"],
+ y=column_data["value"],
  name=column,
  marker=dict(color=column_colors[column]),
- showlegend=show_legend,
+ legendgroup=column, # Ensures multiple traces use the same legend entry
+ showlegend=False, # suppress further legend items
  ),
  row=row,
  col=col,
@@ -1242,8 +1393,8 @@ def plot_facet_stacked_bars(
  if annotations:
  for _, row_data in column_data.iterrows():
  fig.add_annotation(
- x=row_data['index'],
- y=row_data['value'],
+ x=row_data["index"],
+ y=row_data["value"],
  text=f"{row_data['value']:.{precision}f}",
  showarrow=False,
  row=row,
@@ -1257,8 +1408,8 @@ def plot_facet_stacked_bars(

  else:
  axis_details.append(f"[{original_column_names[0]}]")
- if top_n_columns > 0:
- axis_details.append(f"TOP {top_n_columns} [{original_column_names[1]}]")
+ if top_n_color > 0:
+ axis_details.append(f"TOP {top_n_color} [{original_column_names[1]}]")
  else:
  axis_details.append(f"[{original_column_names[1]}]")

@@ -1267,7 +1418,7 @@ def plot_facet_stacked_bars(
  else:
  axis_details.append(f"[{original_column_names[2]}]")

- title = f"{caption} {', '.join(axis_details)}, n = {unique_rows:_}"
+ title = f"{caption} {', '.join(axis_details)}, n = {original_rows:_}"
  template = "plotly_dark" if os.getenv("THEME") == "dark" else "plotly"
  fig.update_layout(
  title=title,
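For illustration only (not from the package source): a hedged sketch of calling the reworked plot_facet_stacked_bars with the renamed top_n_color and the new sort/show_other switches; the three-column frame is invented, and each row counts as 1 because the function assigns value = 1 to 3-column input.

import pandas as pd
from pandas_plots.pls import plot_facet_stacked_bars

# three categorical columns -> index, col, facet; a value of 1 is added per row
df = pd.DataFrame(
    {
        "department": ["sales", "sales", "hr", "hr", "it"],
        "status": ["open", "closed", "open", "open", "closed"],
        "year": ["2023", "2023", "2024", "2024", "2024"],
    }
)

fig = plot_facet_stacked_bars(
    df,
    subplots_per_row=2,
    top_n_color=1,          # renamed from top_n_columns in this release
    show_other=True,        # bucket remaining status values into "<other>"
    sort_values_color=True,
    annotations=True,
)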
pandas_plots/tbl.py CHANGED
@@ -15,7 +15,7 @@ from plotly.subplots import make_subplots
  from scipy import stats
  import dataframe_image as dfi

- from .hlp import wrap_text
+ from .hlp import wrap_text, to_series

  import duckdb as ddb

@@ -696,7 +696,7 @@ def show_num_df(



- def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="🟠 "):
+ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str=" ", precision: int=3):
  """
  Print statistical summary for a pandas DataFrame or Series.

@@ -712,15 +712,44 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
  df (Union[pd.DataFrame, pd.Series]): Input DataFrame or Series. Only numeric columns
  in DataFrame are considered.
  show (bool, optional): Whether to print the summary. Defaults to True.
- name (str, optional): Prefix for the summary. Defaults to "🟠 "
+ name (str, optional): Prefix for the summary. Defaults to " ".
+ precision (int, optional): Number of digits to round the results to. Defaults to 3.
  """
  if df.empty:
  return

  # * drop NA to keep scipy sane
- df = df.dropna().copy()
+ df = df.dropna().copy()

- def print_summary_ser(ser: pd.Series, show: bool=True, name: str=""):
+ # display(df)
+
+ if len(df.columns) == 1:
+ df = df.to_series()
+
+ pd.api.types.is_numeric_dtype(df)
+
+
+ if not (
+ # * series must be numeric
+ (isinstance(df, pd.Series)
+ and pd.api.types.is_numeric_dtype(df)
+ )
+ or
+ # * df must have 2 columns str num
+ (len(df.columns) == 2
+ and (
+ (pd.api.types.is_object_dtype(df.iloc[:, 0]))
+ or (pd.api.types.is_bool_dtype(df.iloc[:, 0]))
+ )
+ and pd.api.types.is_numeric_dtype(df.iloc[:, 1])
+ )
+ ):
+ print(f"❌ df must have 2 columns: [0] str or bool, [1] num, or be a series")
+ return
+
+
+
+ def print_summary_ser(ser: pd.Series, show: bool=True, name: str="", precision: int=3):
  # Calculate IQR and pass `rng=(25, 75)` to get the interquartile range
  iqr_value = stats.iqr(ser)

@@ -728,21 +757,21 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
  # ser.dropna(inplace=True)

  # Using the iqr function, we still calculate the bounds manually
- q1 = stats.scoreatpercentile(ser, 25)
- q3 = stats.scoreatpercentile(ser, 75)
+ q1 = round(stats.scoreatpercentile(ser, 25), precision)
+ q3 = round(stats.scoreatpercentile(ser, 75), precision)

  # Calculate upper bound directly
- min = round(ser.min(),3)
- med = round(ser.median(),3)
- upper = round(q3 + 1.5 * iqr_value,3)
- lower = round(q1 - 1.5 * iqr_value,3)
- mean = round(ser.mean(),3)
- std = round(ser.std(),3)
- cv = round(ser.std() / ser.mean(),3)
- max = round(ser.max(),3)
- sum = round(ser.sum(),3)
- skew = round(stats.skew(ser.dropna().tolist()),3)
- kurto = round(stats.kurtosis(ser.dropna().tolist()),3)
+ min = round(ser.min(), precision)
+ med = round(ser.median(), precision)
+ upper = round(q3 + 1.5 * iqr_value, precision)
+ lower = round(q1 - 1.5 * iqr_value, precision)
+ mean = round(ser.mean(), precision)
+ std = round(ser.std(), precision)
+ cv = round(ser.std() / ser.mean(), precision)
+ max = round(ser.max(), precision)
+ sum = round(ser.sum(), precision)
+ skew = round(stats.skew(ser.dropna().tolist()), precision)
+ kurto = round(stats.kurtosis(ser.dropna().tolist()), precision)

  lower = min if lower < min else lower
  upper = max if upper > max else upper
@@ -750,7 +779,7 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
  # * extra care for scipy metrics, these are very vulnarable to nan
  if show:
  print(
- f"""{name} min: {min:_} | lower: {lower:_} | q25: {q1:_} | median: {med:_} | mean: {mean:_} | q75: {q3:_} | upper: {upper:_} | max: {max:_} | std: {std:_} | cv: {cv:_} | sum: {sum:_} | skew: {skew} | kurto: {kurto}""")
+ f"""{name} -> min: {min:_} | lower: {lower:_} | q25: {q1:_} | median: {med:_} | mean: {mean:_} | q75: {q3:_} | upper: {upper:_} | max: {max:_} | std: {std:_} | cv: {cv:_} | sum: {sum:_} | skew: {skew} | kurto: {kurto}""")

  summary = {
  "min": min,
@@ -770,11 +799,22 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
  return summary

  if isinstance(df, pd.Series):
- return print_summary_ser(df, show=show, name=name)
+ # * print serie
+ name = df.name if df.name else "series"
+ print_summary_ser(ser=df, show=show, name=name, precision=precision)
+ return

  if isinstance(df, pd.DataFrame):
- # * only show numerics
- for col in df.select_dtypes("number").columns:
- summary = print_summary_ser(ser=df[col],show=show, name=col)
+ # * print for all values
+ print(f"🟧 all data")
+ name = df.columns[-1]
+ summary = print_summary_ser(ser=df.iloc[:,1], show=show, name=name, precision=precision)
+
+ print(f"🟧 boxes")
+ # * print for each value
+ for item in df.iloc[:,0].unique():
+ # display(df[df.iloc[:,0] == item])
+ print_summary_ser(ser=df[df.iloc[:,0] == item].iloc[:,1], show=show, name=item, precision=precision)

  return summary
+
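For illustration only (not from the package source): a small, hedged sketch of the extended print_summary contract, mirroring how plot_boxes now calls it; the two-column frame (label column plus numeric column) is invented.

import pandas as pd
from pandas_plots.tbl import print_summary

# [0] group label (str or bool), [1] numeric value
df = pd.DataFrame(
    {
        "group": ["a", "a", "b", "b", "b"],
        "value": [1.0, 2.0, 3.0, 4.0, 5.5],
    }
)

# one summary line for all data, then one line per group, rounded to 1 digit
print_summary(df=df, show=True, precision=1)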
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: pandas-plots
- Version: 0.12.6
+ Version: 0.12.8
  Summary: A collection of helper for table handling and visualization
  Home-page: https://github.com/smeisegeier/pandas-plots
  Author: smeisegeier
@@ -0,0 +1,11 @@
+ pandas_plots/hlp.py,sha256=N6NrbFagVMMX-ZnV0rIBEz82SeSoOkksfMcCap55W7E,16588
+ pandas_plots/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
+ pandas_plots/pls.py,sha256=8wqdoE8hXr1nwQH1Q4KelAso49txb-CyVwVKoqt_xeY,47422
+ pandas_plots/tbl.py,sha256=tuTDRFaD4lKQ2fMeMCJwnJL65zXuUGVQ6uwQNVa0y6Q,31883
+ pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
+ pandas_plots-0.12.8.dist-info/LICENSE,sha256=6KQ5KVAAhRaB-JJKpX4cefKvRZRgI7GUPc92_2d31XY,1051
+ pandas_plots-0.12.8.dist-info/METADATA,sha256=ARFgg-_KdNUg5q0qa4Zhh7o12Bh2VsTWeh-45hHO0D0,7358
+ pandas_plots-0.12.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ pandas_plots-0.12.8.dist-info/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
+ pandas_plots-0.12.8.dist-info/top_level.txt,sha256=XnaNuIHBqMmCeh_U7nKOYTwFue_SIA0wxuDgdPmnnSk,13
+ pandas_plots-0.12.8.dist-info/RECORD,,
@@ -1,11 +0,0 @@
- pandas_plots/hlp.py,sha256=N6NrbFagVMMX-ZnV0rIBEz82SeSoOkksfMcCap55W7E,16588
- pandas_plots/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
- pandas_plots/pls.py,sha256=DsFnWbGNmMnZ8a2qnZFwXH6VekwPFaIwZEQ9TVp6xCg,43997
- pandas_plots/tbl.py,sha256=4VvjLisPT1gSvgsLClcrhC7LIJ-_FPNla8nomGflGag,30509
- pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
- pandas_plots-0.12.6.dist-info/LICENSE,sha256=6KQ5KVAAhRaB-JJKpX4cefKvRZRgI7GUPc92_2d31XY,1051
- pandas_plots-0.12.6.dist-info/METADATA,sha256=-mCMgoWTwG6HSL8JtuYvwM1LCkzglJm3aIocaUMijO4,7358
- pandas_plots-0.12.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- pandas_plots-0.12.6.dist-info/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
- pandas_plots-0.12.6.dist-info/top_level.txt,sha256=XnaNuIHBqMmCeh_U7nKOYTwFue_SIA0wxuDgdPmnnSk,13
- pandas_plots-0.12.6.dist-info/RECORD,,