pandas-plots 0.12.6__py3-none-any.whl → 0.12.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pandas_plots/pls.py +53 -30
- pandas_plots/tbl.py +63 -23
- {pandas_plots-0.12.6.dist-info → pandas_plots-0.12.7.dist-info}/METADATA +1 -1
- pandas_plots-0.12.7.dist-info/RECORD +11 -0
- pandas_plots-0.12.6.dist-info/RECORD +0 -11
- {pandas_plots-0.12.6.dist-info → pandas_plots-0.12.7.dist-info}/LICENSE +0 -0
- {pandas_plots-0.12.6.dist-info → pandas_plots-0.12.7.dist-info}/WHEEL +0 -0
- {pandas_plots-0.12.6.dist-info → pandas_plots-0.12.7.dist-info}/pii.py +0 -0
- {pandas_plots-0.12.6.dist-info → pandas_plots-0.12.7.dist-info}/top_level.txt +0 -0
pandas_plots/pls.py
CHANGED
@@ -12,6 +12,7 @@ from matplotlib import pyplot as plt
|
|
12
12
|
from plotly import express as px
|
13
13
|
import plotly.graph_objects as go
|
14
14
|
from plotly.subplots import make_subplots
|
15
|
+
import plotly # needed for return types
|
15
16
|
|
16
17
|
from .hlp import *
|
17
18
|
from .tbl import print_summary
|
@@ -189,7 +190,7 @@ def plot_stacked_bars(
|
|
189
190
|
png_path: Path | str = None,
|
190
191
|
color_palette: str = "Plotly",
|
191
192
|
null_label: str = "<NA>",
|
192
|
-
) ->
|
193
|
+
) -> plotly.graph_objects:
|
193
194
|
"""
|
194
195
|
Generates a stacked bar plot using the provided DataFrame.
|
195
196
|
|
@@ -220,7 +221,7 @@ def plot_stacked_bars(
|
|
220
221
|
- A Plotly figure object representing the stacked bar chart.
|
221
222
|
"""
|
222
223
|
BAR_LENGTH_MULTIPLIER = 1.05
|
223
|
-
|
224
|
+
|
224
225
|
# * 2 axis means at least 2 columns
|
225
226
|
if len(df.columns) < 2 or len(df.columns) > 3:
|
226
227
|
print("❌ df must have exactly 2 or 3 columns")
|
@@ -263,39 +264,47 @@ def plot_stacked_bars(
|
|
263
264
|
.sum()
|
264
265
|
.reset_index()
|
265
266
|
)
|
267
|
+
|
268
|
+
# * add total as aggregation of df
|
269
|
+
if show_total:
|
270
|
+
df_total = df.groupby(df.columns[1], observed=True, as_index=False)[df.columns[2]].sum()
|
271
|
+
df_total[df.columns[0]] = " Total"
|
272
|
+
df = pd.concat([df, df_total], ignore_index=True)
|
273
|
+
|
274
|
+
|
275
|
+
# * apply top_n, reduce df
|
276
|
+
n_col = top_n_color if top_n_color > 0 else None
|
277
|
+
n_idx = top_n_index if top_n_index > 0 else None
|
278
|
+
|
279
|
+
unique_colors = sorted(
|
280
|
+
df.groupby(col_color)[df.columns[2]]
|
281
|
+
.sum()
|
282
|
+
.sort_values(ascending=False)
|
283
|
+
.index.tolist()[:n_col]
|
284
|
+
)
|
266
285
|
|
267
|
-
|
286
|
+
unique_idx = df[col_index].sort_values().unique()[:n_idx]
|
287
|
+
|
288
|
+
df = df[df[col_color].isin(unique_colors)]#.sort_values(by=[col_index, col_color])
|
289
|
+
df = df[df[col_index].isin(unique_idx)]#.sort_values(by=[col_index, col_color])
|
290
|
+
|
291
|
+
|
292
|
+
# # * Sorting logic based on sort_values
|
268
293
|
if sort_values:
|
269
294
|
sort_order = (
|
270
295
|
df.groupby(col_index)[df.columns[2]].sum().sort_values(ascending=False).index
|
271
296
|
)
|
272
297
|
else:
|
273
298
|
sort_order = sorted(df[col_index].unique()) # Alphabetical order
|
274
|
-
df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
|
275
299
|
|
276
|
-
# *
|
277
|
-
if show_total:
|
278
|
-
df_total = df.copy()
|
279
|
-
df_total[col_index] = " TOTAL" # add space to make this item first
|
280
|
-
df = pd.concat([df, df_total])
|
281
|
-
|
282
|
-
# * Convert to categorical with explicit ordering
|
300
|
+
# # * Convert to categorical with explicit ordering
|
283
301
|
df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
|
284
302
|
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
if top_n_color > 0 and len(unique_colors) > top_n_color:
|
291
|
-
top_colors = unique_colors[:top_n_color]
|
292
|
-
df[col_color] = df[col_color].apply(lambda x: x if x in top_colors else "<other>")
|
293
|
-
|
294
|
-
column_colors = assign_column_colors(sorted(df[col_color].unique()), color_palette, null_label)
|
295
|
-
|
296
|
-
# # * assign colors to columns
|
297
|
-
# unique_colors = sorted(df[col_color].unique())
|
298
|
-
# column_colors = assign_column_colors(unique_colors, color_palette, null_label)
|
303
|
+
column_colors = assign_column_colors(
|
304
|
+
columns=unique_colors,
|
305
|
+
color_palette=color_palette,
|
306
|
+
null_label=null_label
|
307
|
+
)
|
299
308
|
|
300
309
|
# * calculate n
|
301
310
|
divider = 2 if show_total else 1
|
@@ -308,13 +317,24 @@ def plot_stacked_bars(
|
|
308
317
|
_title_str_n = f", n={n:_}"
|
309
318
|
caption = _set_caption(caption)
|
310
319
|
|
320
|
+
# * after grouping add cols for pct and formatting
|
321
|
+
df["pct"] = df[df.columns[2]].apply(lambda x: f"{(x / n) * 100:.{precision}f}%")
|
322
|
+
|
323
|
+
# * format output
|
324
|
+
df["cnt_str"] = df[df.columns[2]].apply(lambda x: f"{x:_.{precision}f}")
|
325
|
+
|
326
|
+
divider2 = "<br>" if orientation == "v" else " "
|
327
|
+
df["cnt_pct_str"] = df.apply(
|
328
|
+
lambda row: f"{row['cnt_str']}{divider2}({row['pct']})", axis=1
|
329
|
+
)
|
330
|
+
|
311
331
|
# * plot
|
312
332
|
fig = px.bar(
|
313
333
|
df,
|
314
334
|
x=col_index if orientation == "v" else df.columns[2],
|
315
335
|
y=df.columns[2] if orientation == "v" else col_index,
|
316
336
|
color=col_color,
|
317
|
-
text=
|
337
|
+
text="cnt_pct_str" if normalize else "cnt_str",
|
318
338
|
orientation=orientation,
|
319
339
|
title=title
|
320
340
|
or f"{caption}{_title_str_top_index}[{col_index}] by {_title_str_top_color}[{col_color}]{_title_str_null}{_title_str_n}",
|
@@ -323,9 +343,9 @@ def plot_stacked_bars(
|
|
323
343
|
height=height,
|
324
344
|
color_discrete_map=column_colors, # Use assigned colors
|
325
345
|
category_orders={col_index: list(df[col_index].cat.categories)}, # <- Add this line
|
326
|
-
# category_orders={col_index: df[col_index].categories.tolist() if isinstance(df[col_index].dtype, pd.CategoricalDtype) else sorted(df[col_index].unique())}
|
327
346
|
|
328
347
|
)
|
348
|
+
|
329
349
|
# * get longest bar
|
330
350
|
bar_max = (
|
331
351
|
df.groupby(col_index)[df.columns[2]].sum().sort_values(ascending=False).iloc[0]
|
@@ -488,7 +508,10 @@ def plot_bars(
|
|
488
508
|
|
489
509
|
# * after grouping add cols for pct and formatting
|
490
510
|
df["pct"] = df[df.columns[1]] / n
|
511
|
+
|
512
|
+
# * format output
|
491
513
|
df["cnt_str"] = df[df.columns[1]].apply(lambda x: f"{x:_.{precision}f}")
|
514
|
+
|
492
515
|
divider = "<br>" if orientation == "v" else " "
|
493
516
|
df["cnt_pct_str"] = df.apply(
|
494
517
|
lambda row: f"{row['cnt_str']}{divider}({row['pct']:.1%})", axis=1
|
@@ -991,7 +1014,7 @@ def plot_boxes(
|
|
991
1014
|
points: Literal["all", "outliers", "suspectedoutliers", None] = None,
|
992
1015
|
precision: int = 2,
|
993
1016
|
height: int = 600,
|
994
|
-
width: int =
|
1017
|
+
width: int = 1200,
|
995
1018
|
annotations: bool = True,
|
996
1019
|
summary: bool = True,
|
997
1020
|
title: str = None,
|
@@ -1018,7 +1041,7 @@ def plot_boxes(
|
|
1018
1041
|
if (
|
1019
1042
|
len(df.columns) != 2
|
1020
1043
|
or not (
|
1021
|
-
(pd.api.types.
|
1044
|
+
(pd.api.types.is_object_dtype(df.iloc[:, 0]))
|
1022
1045
|
or (pd.api.types.is_bool_dtype(df.iloc[:, 0]))
|
1023
1046
|
)
|
1024
1047
|
or not pd.api.types.is_numeric_dtype(df.iloc[:, 1])
|
@@ -1124,7 +1147,7 @@ def plot_boxes(
|
|
1124
1147
|
|
1125
1148
|
fig.show("png")
|
1126
1149
|
if summary:
|
1127
|
-
print_summary(df)
|
1150
|
+
print_summary(df=df, precision=precision)
|
1128
1151
|
|
1129
1152
|
# * save to png if path is provided
|
1130
1153
|
if png_path is not None:
|
pandas_plots/tbl.py
CHANGED
@@ -15,7 +15,7 @@ from plotly.subplots import make_subplots
|
|
15
15
|
from scipy import stats
|
16
16
|
import dataframe_image as dfi
|
17
17
|
|
18
|
-
from .hlp import wrap_text
|
18
|
+
from .hlp import wrap_text, to_series
|
19
19
|
|
20
20
|
import duckdb as ddb
|
21
21
|
|
@@ -696,7 +696,7 @@ def show_num_df(
|
|
696
696
|
|
697
697
|
|
698
698
|
|
699
|
-
def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
|
699
|
+
def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str=" ", precision: int=3):
|
700
700
|
"""
|
701
701
|
Print statistical summary for a pandas DataFrame or Series.
|
702
702
|
|
@@ -712,15 +712,44 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
|
|
712
712
|
df (Union[pd.DataFrame, pd.Series]): Input DataFrame or Series. Only numeric columns
|
713
713
|
in DataFrame are considered.
|
714
714
|
show (bool, optional): Whether to print the summary. Defaults to True.
|
715
|
-
name (str, optional): Prefix for the summary. Defaults to "
|
715
|
+
name (str, optional): Prefix for the summary. Defaults to " ".
|
716
|
+
precision (int, optional): Number of digits to round the results to. Defaults to 3.
|
716
717
|
"""
|
717
718
|
if df.empty:
|
718
719
|
return
|
719
720
|
|
720
721
|
# * drop NA to keep scipy sane
|
721
|
-
df = df.dropna().copy()
|
722
|
+
df = df.dropna().copy()
|
722
723
|
|
723
|
-
|
724
|
+
# display(df)
|
725
|
+
|
726
|
+
if len(df.columns) == 1:
|
727
|
+
df = df.to_series()
|
728
|
+
|
729
|
+
pd.api.types.is_numeric_dtype(df)
|
730
|
+
|
731
|
+
|
732
|
+
if not (
|
733
|
+
# * series must be numeric
|
734
|
+
(isinstance(df, pd.Series)
|
735
|
+
and pd.api.types.is_numeric_dtype(df)
|
736
|
+
)
|
737
|
+
or
|
738
|
+
# * df must have 2 columns str num
|
739
|
+
(len(df.columns) == 2
|
740
|
+
and (
|
741
|
+
(pd.api.types.is_object_dtype(df.iloc[:, 0]))
|
742
|
+
or (pd.api.types.is_bool_dtype(df.iloc[:, 0]))
|
743
|
+
)
|
744
|
+
and pd.api.types.is_numeric_dtype(df.iloc[:, 1])
|
745
|
+
)
|
746
|
+
):
|
747
|
+
print(f"❌ df must have 2 columns: [0] str or bool, [1] num, or be a series")
|
748
|
+
return
|
749
|
+
|
750
|
+
|
751
|
+
|
752
|
+
def print_summary_ser(ser: pd.Series, show: bool=True, name: str="", precision: int=3):
|
724
753
|
# Calculate IQR and pass `rng=(25, 75)` to get the interquartile range
|
725
754
|
iqr_value = stats.iqr(ser)
|
726
755
|
|
@@ -728,21 +757,21 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
|
|
728
757
|
# ser.dropna(inplace=True)
|
729
758
|
|
730
759
|
# Using the iqr function, we still calculate the bounds manually
|
731
|
-
q1 = stats.scoreatpercentile(ser, 25)
|
732
|
-
q3 = stats.scoreatpercentile(ser, 75)
|
760
|
+
q1 = round(stats.scoreatpercentile(ser, 25), precision)
|
761
|
+
q3 = round(stats.scoreatpercentile(ser, 75), precision)
|
733
762
|
|
734
763
|
# Calculate upper bound directly
|
735
|
-
min = round(ser.min(),
|
736
|
-
med = round(ser.median(),
|
737
|
-
upper = round(q3 + 1.5 * iqr_value,
|
738
|
-
lower = round(q1 - 1.5 * iqr_value,
|
739
|
-
mean = round(ser.mean(),
|
740
|
-
std = round(ser.std(),
|
741
|
-
cv = round(ser.std() / ser.mean(),
|
742
|
-
max = round(ser.max(),
|
743
|
-
sum = round(ser.sum(),
|
744
|
-
skew = round(stats.skew(ser.dropna().tolist()),
|
745
|
-
kurto = round(stats.kurtosis(ser.dropna().tolist()),
|
764
|
+
min = round(ser.min(), precision)
|
765
|
+
med = round(ser.median(), precision)
|
766
|
+
upper = round(q3 + 1.5 * iqr_value, precision)
|
767
|
+
lower = round(q1 - 1.5 * iqr_value, precision)
|
768
|
+
mean = round(ser.mean(), precision)
|
769
|
+
std = round(ser.std(), precision)
|
770
|
+
cv = round(ser.std() / ser.mean(), precision)
|
771
|
+
max = round(ser.max(), precision)
|
772
|
+
sum = round(ser.sum(), precision)
|
773
|
+
skew = round(stats.skew(ser.dropna().tolist()), precision)
|
774
|
+
kurto = round(stats.kurtosis(ser.dropna().tolist()), precision)
|
746
775
|
|
747
776
|
lower = min if lower < min else lower
|
748
777
|
upper = max if upper > max else upper
|
@@ -750,7 +779,7 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
|
|
750
779
|
# * extra care for scipy metrics, these are very vulnarable to nan
|
751
780
|
if show:
|
752
781
|
print(
|
753
|
-
f"""{name} min: {min:_} | lower: {lower:_} | q25: {q1:_} | median: {med:_} | mean: {mean:_} | q75: {q3:_} | upper: {upper:_} | max: {max:_} | std: {std:_} | cv: {cv:_} | sum: {sum:_} | skew: {skew} | kurto: {kurto}""")
|
782
|
+
f"""{name} -> min: {min:_} | lower: {lower:_} | q25: {q1:_} | median: {med:_} | mean: {mean:_} | q75: {q3:_} | upper: {upper:_} | max: {max:_} | std: {std:_} | cv: {cv:_} | sum: {sum:_} | skew: {skew} | kurto: {kurto}""")
|
754
783
|
|
755
784
|
summary = {
|
756
785
|
"min": min,
|
@@ -770,11 +799,22 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
|
|
770
799
|
return summary
|
771
800
|
|
772
801
|
if isinstance(df, pd.Series):
|
773
|
-
|
802
|
+
# * print serie
|
803
|
+
name = df.name if df.name else "series"
|
804
|
+
print_summary_ser(ser=df, show=show, name=name, precision=precision)
|
805
|
+
return
|
774
806
|
|
775
807
|
if isinstance(df, pd.DataFrame):
|
776
|
-
# *
|
777
|
-
|
778
|
-
|
808
|
+
# * print for all values
|
809
|
+
print(f"🟧 all data")
|
810
|
+
name = df.columns[-1]
|
811
|
+
summary = print_summary_ser(ser=df.iloc[:,1], show=show, name=name, precision=precision)
|
812
|
+
|
813
|
+
print(f"🟧 boxes")
|
814
|
+
# * print for each value
|
815
|
+
for item in df.iloc[:,0].unique():
|
816
|
+
# display(df[df.iloc[:,0] == item])
|
817
|
+
print_summary_ser(ser=df[df.iloc[:,0] == item].iloc[:,1], show=show, name=item, precision=precision)
|
779
818
|
|
780
819
|
return summary
|
820
|
+
|
@@ -0,0 +1,11 @@
|
|
1
|
+
pandas_plots/hlp.py,sha256=N6NrbFagVMMX-ZnV0rIBEz82SeSoOkksfMcCap55W7E,16588
|
2
|
+
pandas_plots/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
|
3
|
+
pandas_plots/pls.py,sha256=WpQ8hPmp8MbHvgEvSejDYFXyY_hZabLY4OLW8S6u15g,44310
|
4
|
+
pandas_plots/tbl.py,sha256=tuTDRFaD4lKQ2fMeMCJwnJL65zXuUGVQ6uwQNVa0y6Q,31883
|
5
|
+
pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
|
6
|
+
pandas_plots-0.12.7.dist-info/LICENSE,sha256=6KQ5KVAAhRaB-JJKpX4cefKvRZRgI7GUPc92_2d31XY,1051
|
7
|
+
pandas_plots-0.12.7.dist-info/METADATA,sha256=9i_TsDQinaUPz9eqJO7a0L4JFZmmM3l_WzoPbmDPH0Y,7358
|
8
|
+
pandas_plots-0.12.7.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
9
|
+
pandas_plots-0.12.7.dist-info/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
|
10
|
+
pandas_plots-0.12.7.dist-info/top_level.txt,sha256=XnaNuIHBqMmCeh_U7nKOYTwFue_SIA0wxuDgdPmnnSk,13
|
11
|
+
pandas_plots-0.12.7.dist-info/RECORD,,
|
@@ -1,11 +0,0 @@
|
|
1
|
-
pandas_plots/hlp.py,sha256=N6NrbFagVMMX-ZnV0rIBEz82SeSoOkksfMcCap55W7E,16588
|
2
|
-
pandas_plots/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
|
3
|
-
pandas_plots/pls.py,sha256=DsFnWbGNmMnZ8a2qnZFwXH6VekwPFaIwZEQ9TVp6xCg,43997
|
4
|
-
pandas_plots/tbl.py,sha256=4VvjLisPT1gSvgsLClcrhC7LIJ-_FPNla8nomGflGag,30509
|
5
|
-
pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
|
6
|
-
pandas_plots-0.12.6.dist-info/LICENSE,sha256=6KQ5KVAAhRaB-JJKpX4cefKvRZRgI7GUPc92_2d31XY,1051
|
7
|
-
pandas_plots-0.12.6.dist-info/METADATA,sha256=-mCMgoWTwG6HSL8JtuYvwM1LCkzglJm3aIocaUMijO4,7358
|
8
|
-
pandas_plots-0.12.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
9
|
-
pandas_plots-0.12.6.dist-info/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
|
10
|
-
pandas_plots-0.12.6.dist-info/top_level.txt,sha256=XnaNuIHBqMmCeh_U7nKOYTwFue_SIA0wxuDgdPmnnSk,13
|
11
|
-
pandas_plots-0.12.6.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|