pandas-plots 0.12.6__py3-none-any.whl → 0.12.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pandas_plots/pls.py CHANGED
@@ -12,6 +12,7 @@ from matplotlib import pyplot as plt
12
12
  from plotly import express as px
13
13
  import plotly.graph_objects as go
14
14
  from plotly.subplots import make_subplots
15
+ import plotly # needed for return types
15
16
 
16
17
  from .hlp import *
17
18
  from .tbl import print_summary
@@ -189,7 +190,7 @@ def plot_stacked_bars(
189
190
  png_path: Path | str = None,
190
191
  color_palette: str = "Plotly",
191
192
  null_label: str = "<NA>",
192
- ) -> object:
193
+ ) -> plotly.graph_objects:
193
194
  """
194
195
  Generates a stacked bar plot using the provided DataFrame.
195
196
 
@@ -220,7 +221,7 @@ def plot_stacked_bars(
220
221
  - A Plotly figure object representing the stacked bar chart.
221
222
  """
222
223
  BAR_LENGTH_MULTIPLIER = 1.05
223
-
224
+
224
225
  # * 2 axis means at least 2 columns
225
226
  if len(df.columns) < 2 or len(df.columns) > 3:
226
227
  print("❌ df must have exactly 2 or 3 columns")
@@ -263,39 +264,47 @@ def plot_stacked_bars(
263
264
  .sum()
264
265
  .reset_index()
265
266
  )
267
+
268
+ # * add total as aggregation of df
269
+ if show_total:
270
+ df_total = df.groupby(df.columns[1], observed=True, as_index=False)[df.columns[2]].sum()
271
+ df_total[df.columns[0]] = " Total"
272
+ df = pd.concat([df, df_total], ignore_index=True)
273
+
274
+
275
+ # * apply top_n, reduce df
276
+ n_col = top_n_color if top_n_color > 0 else None
277
+ n_idx = top_n_index if top_n_index > 0 else None
278
+
279
+ unique_colors = sorted(
280
+ df.groupby(col_color)[df.columns[2]]
281
+ .sum()
282
+ .sort_values(ascending=False)
283
+ .index.tolist()[:n_col]
284
+ )
266
285
 
267
- # * Sorting logic based on sort_values
286
+ unique_idx = df[col_index].sort_values().unique()[:n_idx]
287
+
288
+ df = df[df[col_color].isin(unique_colors)]#.sort_values(by=[col_index, col_color])
289
+ df = df[df[col_index].isin(unique_idx)]#.sort_values(by=[col_index, col_color])
290
+
291
+
292
+ # # * Sorting logic based on sort_values
268
293
  if sort_values:
269
294
  sort_order = (
270
295
  df.groupby(col_index)[df.columns[2]].sum().sort_values(ascending=False).index
271
296
  )
272
297
  else:
273
298
  sort_order = sorted(df[col_index].unique()) # Alphabetical order
274
- df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
275
299
 
276
- # * add total as aggregation of df
277
- if show_total:
278
- df_total = df.copy()
279
- df_total[col_index] = " TOTAL" # add space to make this item first
280
- df = pd.concat([df, df_total])
281
-
282
- # * Convert to categorical with explicit ordering
300
+ # # * Convert to categorical with explicit ordering
283
301
  df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
284
302
 
285
- if top_n_index > 0 and len(sort_order) > top_n_index:
286
- top_categories = sort_order[:top_n_index]
287
- df[col_index] = df[col_index].apply(lambda x: x if x in top_categories else "<other>")
288
-
289
- unique_colors = sorted(df[col_color].unique())
290
- if top_n_color > 0 and len(unique_colors) > top_n_color:
291
- top_colors = unique_colors[:top_n_color]
292
- df[col_color] = df[col_color].apply(lambda x: x if x in top_colors else "<other>")
293
-
294
- column_colors = assign_column_colors(sorted(df[col_color].unique()), color_palette, null_label)
295
-
296
- # # * assign colors to columns
297
- # unique_colors = sorted(df[col_color].unique())
298
- # column_colors = assign_column_colors(unique_colors, color_palette, null_label)
303
+ column_colors = assign_column_colors(
304
+ columns=unique_colors,
305
+ color_palette=color_palette,
306
+ null_label=null_label
307
+ )
299
308
 
300
309
  # * calculate n
301
310
  divider = 2 if show_total else 1
@@ -308,13 +317,24 @@ def plot_stacked_bars(
308
317
  _title_str_n = f", n={n:_}"
309
318
  caption = _set_caption(caption)
310
319
 
320
+ # * after grouping add cols for pct and formatting
321
+ df["pct"] = df[df.columns[2]].apply(lambda x: f"{(x / n) * 100:.{precision}f}%")
322
+
323
+ # * format output
324
+ df["cnt_str"] = df[df.columns[2]].apply(lambda x: f"{x:_.{precision}f}")
325
+
326
+ divider2 = "<br>" if orientation == "v" else " "
327
+ df["cnt_pct_str"] = df.apply(
328
+ lambda row: f"{row['cnt_str']}{divider2}({row['pct']})", axis=1
329
+ )
330
+
311
331
  # * plot
312
332
  fig = px.bar(
313
333
  df,
314
334
  x=col_index if orientation == "v" else df.columns[2],
315
335
  y=df.columns[2] if orientation == "v" else col_index,
316
336
  color=col_color,
317
- text=df.columns[2],
337
+ text="cnt_pct_str" if normalize else "cnt_str",
318
338
  orientation=orientation,
319
339
  title=title
320
340
  or f"{caption}{_title_str_top_index}[{col_index}] by {_title_str_top_color}[{col_color}]{_title_str_null}{_title_str_n}",
@@ -323,9 +343,9 @@ def plot_stacked_bars(
323
343
  height=height,
324
344
  color_discrete_map=column_colors, # Use assigned colors
325
345
  category_orders={col_index: list(df[col_index].cat.categories)}, # <- Add this line
326
- # category_orders={col_index: df[col_index].categories.tolist() if isinstance(df[col_index].dtype, pd.CategoricalDtype) else sorted(df[col_index].unique())}
327
346
 
328
347
  )
348
+
329
349
  # * get longest bar
330
350
  bar_max = (
331
351
  df.groupby(col_index)[df.columns[2]].sum().sort_values(ascending=False).iloc[0]
@@ -488,7 +508,10 @@ def plot_bars(
488
508
 
489
509
  # * after grouping add cols for pct and formatting
490
510
  df["pct"] = df[df.columns[1]] / n
511
+
512
+ # * format output
491
513
  df["cnt_str"] = df[df.columns[1]].apply(lambda x: f"{x:_.{precision}f}")
514
+
492
515
  divider = "<br>" if orientation == "v" else " "
493
516
  df["cnt_pct_str"] = df.apply(
494
517
  lambda row: f"{row['cnt_str']}{divider}({row['pct']:.1%})", axis=1
@@ -991,7 +1014,7 @@ def plot_boxes(
991
1014
  points: Literal["all", "outliers", "suspectedoutliers", None] = None,
992
1015
  precision: int = 2,
993
1016
  height: int = 600,
994
- width: int = 800,
1017
+ width: int = 1200,
995
1018
  annotations: bool = True,
996
1019
  summary: bool = True,
997
1020
  title: str = None,
@@ -1018,7 +1041,7 @@ def plot_boxes(
1018
1041
  if (
1019
1042
  len(df.columns) != 2
1020
1043
  or not (
1021
- (pd.api.types.is_string_dtype(df.iloc[:, 0]))
1044
+ (pd.api.types.is_object_dtype(df.iloc[:, 0]))
1022
1045
  or (pd.api.types.is_bool_dtype(df.iloc[:, 0]))
1023
1046
  )
1024
1047
  or not pd.api.types.is_numeric_dtype(df.iloc[:, 1])
@@ -1124,7 +1147,7 @@ def plot_boxes(
1124
1147
 
1125
1148
  fig.show("png")
1126
1149
  if summary:
1127
- print_summary(df)
1150
+ print_summary(df=df, precision=precision)
1128
1151
 
1129
1152
  # * save to png if path is provided
1130
1153
  if png_path is not None:
pandas_plots/tbl.py CHANGED
@@ -15,7 +15,7 @@ from plotly.subplots import make_subplots
15
15
  from scipy import stats
16
16
  import dataframe_image as dfi
17
17
 
18
- from .hlp import wrap_text
18
+ from .hlp import wrap_text, to_series
19
19
 
20
20
  import duckdb as ddb
21
21
 
@@ -696,7 +696,7 @@ def show_num_df(
696
696
 
697
697
 
698
698
 
699
- def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="🟠 "):
699
+ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str=" ", precision: int=3):
700
700
  """
701
701
  Print statistical summary for a pandas DataFrame or Series.
702
702
 
@@ -712,15 +712,44 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
712
712
  df (Union[pd.DataFrame, pd.Series]): Input DataFrame or Series. Only numeric columns
713
713
  in DataFrame are considered.
714
714
  show (bool, optional): Whether to print the summary. Defaults to True.
715
- name (str, optional): Prefix for the summary. Defaults to "🟠 "
715
+ name (str, optional): Prefix for the summary. Defaults to " ".
716
+ precision (int, optional): Number of digits to round the results to. Defaults to 3.
716
717
  """
717
718
  if df.empty:
718
719
  return
719
720
 
720
721
  # * drop NA to keep scipy sane
721
- df = df.dropna().copy()
722
+ df = df.dropna().copy()
722
723
 
723
- def print_summary_ser(ser: pd.Series, show: bool=True, name: str=""):
724
+ # display(df)
725
+
726
+ if len(df.columns) == 1:
727
+ df = df.to_series()
728
+
729
+ pd.api.types.is_numeric_dtype(df)
730
+
731
+
732
+ if not (
733
+ # * series must be numeric
734
+ (isinstance(df, pd.Series)
735
+ and pd.api.types.is_numeric_dtype(df)
736
+ )
737
+ or
738
+ # * df must have 2 columns str num
739
+ (len(df.columns) == 2
740
+ and (
741
+ (pd.api.types.is_object_dtype(df.iloc[:, 0]))
742
+ or (pd.api.types.is_bool_dtype(df.iloc[:, 0]))
743
+ )
744
+ and pd.api.types.is_numeric_dtype(df.iloc[:, 1])
745
+ )
746
+ ):
747
+ print(f"❌ df must have 2 columns: [0] str or bool, [1] num, or be a series")
748
+ return
749
+
750
+
751
+
752
+ def print_summary_ser(ser: pd.Series, show: bool=True, name: str="", precision: int=3):
724
753
  # Calculate the interquartile range; stats.iqr uses rng=(25, 75) by default
725
754
  iqr_value = stats.iqr(ser)
726
755
 
@@ -728,21 +757,21 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
728
757
  # ser.dropna(inplace=True)
729
758
 
730
759
  # Using the iqr function, we still calculate the bounds manually
731
- q1 = stats.scoreatpercentile(ser, 25)
732
- q3 = stats.scoreatpercentile(ser, 75)
760
+ q1 = round(stats.scoreatpercentile(ser, 25), precision)
761
+ q3 = round(stats.scoreatpercentile(ser, 75), precision)
733
762
 
734
763
  # Calculate upper bound directly
735
- min = round(ser.min(),3)
736
- med = round(ser.median(),3)
737
- upper = round(q3 + 1.5 * iqr_value,3)
738
- lower = round(q1 - 1.5 * iqr_value,3)
739
- mean = round(ser.mean(),3)
740
- std = round(ser.std(),3)
741
- cv = round(ser.std() / ser.mean(),3)
742
- max = round(ser.max(),3)
743
- sum = round(ser.sum(),3)
744
- skew = round(stats.skew(ser.dropna().tolist()),3)
745
- kurto = round(stats.kurtosis(ser.dropna().tolist()),3)
764
+ min = round(ser.min(), precision)
765
+ med = round(ser.median(), precision)
766
+ upper = round(q3 + 1.5 * iqr_value, precision)
767
+ lower = round(q1 - 1.5 * iqr_value, precision)
768
+ mean = round(ser.mean(), precision)
769
+ std = round(ser.std(), precision)
770
+ cv = round(ser.std() / ser.mean(), precision)
771
+ max = round(ser.max(), precision)
772
+ sum = round(ser.sum(), precision)
773
+ skew = round(stats.skew(ser.dropna().tolist()), precision)
774
+ kurto = round(stats.kurtosis(ser.dropna().tolist()), precision)
746
775
 
747
776
  lower = min if lower < min else lower
748
777
  upper = max if upper > max else upper
@@ -750,7 +779,7 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
750
779
  # * extra care for scipy metrics, these are very vulnerable to NaN
751
780
  if show:
752
781
  print(
753
- f"""{name} min: {min:_} | lower: {lower:_} | q25: {q1:_} | median: {med:_} | mean: {mean:_} | q75: {q3:_} | upper: {upper:_} | max: {max:_} | std: {std:_} | cv: {cv:_} | sum: {sum:_} | skew: {skew} | kurto: {kurto}""")
782
+ f"""{name} -> min: {min:_} | lower: {lower:_} | q25: {q1:_} | median: {med:_} | mean: {mean:_} | q75: {q3:_} | upper: {upper:_} | max: {max:_} | std: {std:_} | cv: {cv:_} | sum: {sum:_} | skew: {skew} | kurto: {kurto}""")
754
783
 
755
784
  summary = {
756
785
  "min": min,
@@ -770,11 +799,22 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
770
799
  return summary
771
800
 
772
801
  if isinstance(df, pd.Series):
773
- return print_summary_ser(df, show=show, name=name)
802
+ # * print series
803
+ name = df.name if df.name else "series"
804
+ print_summary_ser(ser=df, show=show, name=name, precision=precision)
805
+ return
774
806
 
775
807
  if isinstance(df, pd.DataFrame):
776
- # * only show numerics
777
- for col in df.select_dtypes("number").columns:
778
- summary = print_summary_ser(ser=df[col],show=show, name=col)
808
+ # * print for all values
809
+ print(f"🟧 all data")
810
+ name = df.columns[-1]
811
+ summary = print_summary_ser(ser=df.iloc[:,1], show=show, name=name, precision=precision)
812
+
813
+ print(f"🟧 boxes")
814
+ # * print for each value
815
+ for item in df.iloc[:,0].unique():
816
+ # display(df[df.iloc[:,0] == item])
817
+ print_summary_ser(ser=df[df.iloc[:,0] == item].iloc[:,1], show=show, name=item, precision=precision)
779
818
 
780
819
  return summary
820
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: pandas-plots
3
- Version: 0.12.6
3
+ Version: 0.12.7
4
4
  Summary: A collection of helper for table handling and visualization
5
5
  Home-page: https://github.com/smeisegeier/pandas-plots
6
6
  Author: smeisegeier
@@ -0,0 +1,11 @@
1
+ pandas_plots/hlp.py,sha256=N6NrbFagVMMX-ZnV0rIBEz82SeSoOkksfMcCap55W7E,16588
2
+ pandas_plots/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
3
+ pandas_plots/pls.py,sha256=WpQ8hPmp8MbHvgEvSejDYFXyY_hZabLY4OLW8S6u15g,44310
4
+ pandas_plots/tbl.py,sha256=tuTDRFaD4lKQ2fMeMCJwnJL65zXuUGVQ6uwQNVa0y6Q,31883
5
+ pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
6
+ pandas_plots-0.12.7.dist-info/LICENSE,sha256=6KQ5KVAAhRaB-JJKpX4cefKvRZRgI7GUPc92_2d31XY,1051
7
+ pandas_plots-0.12.7.dist-info/METADATA,sha256=9i_TsDQinaUPz9eqJO7a0L4JFZmmM3l_WzoPbmDPH0Y,7358
8
+ pandas_plots-0.12.7.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
9
+ pandas_plots-0.12.7.dist-info/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
10
+ pandas_plots-0.12.7.dist-info/top_level.txt,sha256=XnaNuIHBqMmCeh_U7nKOYTwFue_SIA0wxuDgdPmnnSk,13
11
+ pandas_plots-0.12.7.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- pandas_plots/hlp.py,sha256=N6NrbFagVMMX-ZnV0rIBEz82SeSoOkksfMcCap55W7E,16588
2
- pandas_plots/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
3
- pandas_plots/pls.py,sha256=DsFnWbGNmMnZ8a2qnZFwXH6VekwPFaIwZEQ9TVp6xCg,43997
4
- pandas_plots/tbl.py,sha256=4VvjLisPT1gSvgsLClcrhC7LIJ-_FPNla8nomGflGag,30509
5
- pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
6
- pandas_plots-0.12.6.dist-info/LICENSE,sha256=6KQ5KVAAhRaB-JJKpX4cefKvRZRgI7GUPc92_2d31XY,1051
7
- pandas_plots-0.12.6.dist-info/METADATA,sha256=-mCMgoWTwG6HSL8JtuYvwM1LCkzglJm3aIocaUMijO4,7358
8
- pandas_plots-0.12.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
9
- pandas_plots-0.12.6.dist-info/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
10
- pandas_plots-0.12.6.dist-info/top_level.txt,sha256=XnaNuIHBqMmCeh_U7nKOYTwFue_SIA0wxuDgdPmnnSk,13
11
- pandas_plots-0.12.6.dist-info/RECORD,,