PyPI - pandas-plots - Versions diffs - 0.12.6__py3-none-any.whl → 0.12.7__py3-none-any.whl - Mend

pandas-plots 0.12.6__py3-none-any.whl → 0.12.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

pandas_plots/pls.py CHANGED Viewed

@@ -12,6 +12,7 @@ from matplotlib import pyplot as plt
 from plotly import express as px
 import plotly.graph_objects as go
 from plotly.subplots import make_subplots
+import plotly # needed for return types
 from .hlp import *
 from .tbl import print_summary
@@ -189,7 +190,7 @@ def plot_stacked_bars(
     png_path: Path | str = None,
     color_palette: str = "Plotly",
     null_label: str = "<NA>",
-) -> object:
+) -> plotly.graph_objects:
     """
     Generates a stacked bar plot using the provided DataFrame.
@@ -220,7 +221,7 @@ def plot_stacked_bars(
     - A Plotly figure object representing the stacked bar chart.
     """
     BAR_LENGTH_MULTIPLIER = 1.05
     # * 2 axis means at least 2 columns
     if len(df.columns) < 2 or len(df.columns) > 3:
         print("❌ df must have exactly 2 or 3 columns")
@@ -263,39 +264,47 @@ def plot_stacked_bars(
         .sum()
         .reset_index()
     )
+    # * add total as aggregation of df
+    if show_total:
+        df_total = df.groupby(df.columns[1], observed=True, as_index=False)[df.columns[2]].sum()
+        df_total[df.columns[0]] = " Total"
+        df = pd.concat([df, df_total], ignore_index=True)
+    # * apply top_n, reduce df
+    n_col = top_n_color if top_n_color > 0 else None
+    n_idx = top_n_index if top_n_index > 0 else None
+    unique_colors = sorted(
+        df.groupby(col_color)[df.columns[2]]
+        .sum()
+        .sort_values(ascending=False)
+        .index.tolist()[:n_col]
+    )
-    # * Sorting logic based on sort_values
+    unique_idx = df[col_index].sort_values().unique()[:n_idx]
+    df = df[df[col_color].isin(unique_colors)]#.sort_values(by=[col_index, col_color])
+    df = df[df[col_index].isin(unique_idx)]#.sort_values(by=[col_index, col_color])
+    # # * Sorting logic based on sort_values
     if sort_values:
         sort_order = (
             df.groupby(col_index)[df.columns[2]].sum().sort_values(ascending=False).index
         )
     else:
         sort_order = sorted(df[col_index].unique())  # Alphabetical order
-    df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
-    # * add total as aggregation of df
-    if show_total:
-        df_total = df.copy()
-        df_total[col_index] = " TOTAL"  # add space to make this item first
-        df = pd.concat([df, df_total])
-    # * Convert to categorical with explicit ordering
+    # # * Convert to categorical with explicit ordering
     df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
-    if top_n_index > 0 and len(sort_order) > top_n_index:
-        top_categories = sort_order[:top_n_index]
-        df[col_index] = df[col_index].apply(lambda x: x if x in top_categories else "<other>")
-    unique_colors = sorted(df[col_color].unique())
-    if top_n_color > 0 and len(unique_colors) > top_n_color:
-        top_colors = unique_colors[:top_n_color]
-        df[col_color] = df[col_color].apply(lambda x: x if x in top_colors else "<other>")
-    column_colors = assign_column_colors(sorted(df[col_color].unique()), color_palette, null_label)
-    # # * assign colors to columns
-    # unique_colors = sorted(df[col_color].unique())
-    # column_colors = assign_column_colors(unique_colors, color_palette, null_label)
+    column_colors = assign_column_colors(
+        columns=unique_colors,
+        color_palette=color_palette,
+        null_label=null_label
+        )
     # * calculate n
     divider = 2 if show_total else 1
@@ -308,13 +317,24 @@ def plot_stacked_bars(
     _title_str_n = f", n={n:_}"
     caption = _set_caption(caption)
+        # * after grouping add cols for pct and formatting
+    df["pct"] = df[df.columns[2]].apply(lambda x: f"{(x / n) * 100:.{precision}f}%")
+    # * format output
+    df["cnt_str"] = df[df.columns[2]].apply(lambda x: f"{x:_.{precision}f}")
+    divider2 = "<br>" if orientation == "v" else " "
+    df["cnt_pct_str"] = df.apply(
+        lambda row: f"{row['cnt_str']}{divider2}({row['pct']})", axis=1
+    )
     # * plot
     fig = px.bar(
         df,
         x=col_index if orientation == "v" else df.columns[2],
         y=df.columns[2] if orientation == "v" else col_index,
         color=col_color,
-        text=df.columns[2],
+        text="cnt_pct_str" if normalize else "cnt_str",
         orientation=orientation,
         title=title
         or f"{caption}{_title_str_top_index}[{col_index}] by {_title_str_top_color}[{col_color}]{_title_str_null}{_title_str_n}",
@@ -323,9 +343,9 @@ def plot_stacked_bars(
         height=height,
         color_discrete_map=column_colors,  # Use assigned colors
         category_orders={col_index: list(df[col_index].cat.categories)},  # <- Add this line
-        # category_orders={col_index: df[col_index].categories.tolist() if isinstance(df[col_index].dtype, pd.CategoricalDtype) else sorted(df[col_index].unique())}
     )
         # * get longest bar
     bar_max = (
         df.groupby(col_index)[df.columns[2]].sum().sort_values(ascending=False).iloc[0]
@@ -488,7 +508,10 @@ def plot_bars(
     # * after grouping add cols for pct and formatting
     df["pct"] = df[df.columns[1]] / n
+    # * format output
     df["cnt_str"] = df[df.columns[1]].apply(lambda x: f"{x:_.{precision}f}")
     divider = "<br>" if orientation == "v" else " "
     df["cnt_pct_str"] = df.apply(
         lambda row: f"{row['cnt_str']}{divider}({row['pct']:.1%})", axis=1
@@ -991,7 +1014,7 @@ def plot_boxes(
     points: Literal["all", "outliers", "suspectedoutliers", None] = None,
     precision: int = 2,
     height: int = 600,
-    width: int = 800,
+    width: int = 1200,
     annotations: bool = True,
     summary: bool = True,
     title: str = None,
@@ -1018,7 +1041,7 @@ def plot_boxes(
     if (
         len(df.columns) != 2
         or not (
-            (pd.api.types.is_string_dtype(df.iloc[:, 0]))
+            (pd.api.types.is_object_dtype(df.iloc[:, 0]))
             or (pd.api.types.is_bool_dtype(df.iloc[:, 0]))
         )
         or not pd.api.types.is_numeric_dtype(df.iloc[:, 1])
@@ -1124,7 +1147,7 @@ def plot_boxes(
     fig.show("png")
     if summary:
-        print_summary(df)
+        print_summary(df=df, precision=precision)
     # * save to png if path is provided
     if png_path is not None:

pandas_plots/tbl.py CHANGED Viewed

@@ -15,7 +15,7 @@ from plotly.subplots import make_subplots
 from scipy import stats
 import dataframe_image as dfi
-from .hlp import wrap_text
+from .hlp import wrap_text, to_series
 import duckdb as ddb
@@ -696,7 +696,7 @@ def show_num_df(
-def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="🟠 "):
+def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str=" ", precision: int=3):
     """
     Print statistical summary for a pandas DataFrame or Series.
@@ -712,15 +712,44 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
         df (Union[pd.DataFrame, pd.Series]): Input DataFrame or Series. Only numeric columns
         in DataFrame are considered.
         show (bool, optional): Whether to print the summary. Defaults to True.
-        name (str, optional): Prefix for the summary. Defaults to "🟠 "
+        name (str, optional): Prefix for the summary. Defaults to " ".
+        precision (int, optional): Number of digits to round the results to. Defaults to 3.
     """
     if df.empty:
         return
     # * drop NA to keep scipy sane
-    df = df.dropna().copy()
+    df = df.dropna().copy()
-    def print_summary_ser(ser: pd.Series, show: bool=True, name: str=""):
+    # display(df)
+    if len(df.columns) == 1:
+        df = df.to_series()
+    pd.api.types.is_numeric_dtype(df)
+    if not (
+        # * series must be numeric
+        (isinstance(df, pd.Series)
+            and pd.api.types.is_numeric_dtype(df)
+        )
+        or
+        # * df must have 2 columns str num
+        (len(df.columns) == 2
+            and (
+                (pd.api.types.is_object_dtype(df.iloc[:, 0]))
+                or (pd.api.types.is_bool_dtype(df.iloc[:, 0]))
+                )
+            and pd.api.types.is_numeric_dtype(df.iloc[:, 1])
+        )
+    ):
+        print(f"❌ df must have 2 columns: [0] str or bool, [1] num, or be a series")
+        return
+    def print_summary_ser(ser: pd.Series, show: bool=True, name: str="", precision: int=3):
         # Calculate IQR and pass `rng=(25, 75)` to get the interquartile range
         iqr_value = stats.iqr(ser)
@@ -728,21 +757,21 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
         # ser.dropna(inplace=True)
         # Using the iqr function, we still calculate the bounds manually
-        q1 = stats.scoreatpercentile(ser, 25)
-        q3 = stats.scoreatpercentile(ser, 75)
+        q1 = round(stats.scoreatpercentile(ser, 25), precision)
+        q3 = round(stats.scoreatpercentile(ser, 75), precision)
         # Calculate upper bound directly
-        min = round(ser.min(),3)
-        med = round(ser.median(),3)
-        upper = round(q3 + 1.5 * iqr_value,3)
-        lower = round(q1 - 1.5 * iqr_value,3)
-        mean = round(ser.mean(),3)
-        std = round(ser.std(),3)
-        cv = round(ser.std() / ser.mean(),3)
-        max = round(ser.max(),3)
-        sum = round(ser.sum(),3)
-        skew = round(stats.skew(ser.dropna().tolist()),3)
-        kurto = round(stats.kurtosis(ser.dropna().tolist()),3)
+        min = round(ser.min(), precision)
+        med = round(ser.median(), precision)
+        upper = round(q3 + 1.5 * iqr_value, precision)
+        lower = round(q1 - 1.5 * iqr_value, precision)
+        mean = round(ser.mean(), precision)
+        std = round(ser.std(), precision)
+        cv = round(ser.std() / ser.mean(), precision)
+        max = round(ser.max(), precision)
+        sum = round(ser.sum(), precision)
+        skew = round(stats.skew(ser.dropna().tolist()), precision)
+        kurto = round(stats.kurtosis(ser.dropna().tolist()), precision)
         lower = min if lower < min else lower
         upper = max if upper > max else upper
@@ -750,7 +779,7 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
         # * extra care for scipy metrics, these are very vulnarable to nan
         if show:
             print(
-                f"""{name} min: {min:_} | lower: {lower:_} | q25: {q1:_} | median: {med:_} | mean: {mean:_} | q75: {q3:_} | upper: {upper:_} | max: {max:_} | std: {std:_} | cv: {cv:_} | sum: {sum:_} | skew: {skew} | kurto: {kurto}""")
+                f"""{name} -> min: {min:_} | lower: {lower:_} | q25: {q1:_} | median: {med:_} | mean: {mean:_} | q75: {q3:_} | upper: {upper:_} | max: {max:_} | std: {std:_} | cv: {cv:_} | sum: {sum:_} | skew: {skew} | kurto: {kurto}""")
         summary = {
             "min": min,
@@ -770,11 +799,22 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
         return summary
     if isinstance(df, pd.Series):
-        return print_summary_ser(df, show=show, name=name)
+        # * print serie
+        name = df.name if df.name else "series"
+        print_summary_ser(ser=df, show=show, name=name, precision=precision)
+        return
     if isinstance(df, pd.DataFrame):
-        # * only show numerics
-        for col in df.select_dtypes("number").columns:
-            summary = print_summary_ser(ser=df[col],show=show, name=col)
+        # * print for all values
+        print(f"🟧 all data")
+        name = df.columns[-1]
+        summary = print_summary_ser(ser=df.iloc[:,1], show=show, name=name, precision=precision)
+        print(f"🟧 boxes")
+        # * print for each value
+        for item in df.iloc[:,0].unique():
+            # display(df[df.iloc[:,0] == item])
+            print_summary_ser(ser=df[df.iloc[:,0] == item].iloc[:,1], show=show, name=item, precision=precision)
     return summary

{pandas_plots-0.12.6.dist-info → pandas_plots-0.12.7.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: pandas-plots
-Version: 0.12.6
+Version: 0.12.7
 Summary: A collection of helper for table handling and visualization
 Home-page: https://github.com/smeisegeier/pandas-plots
 Author: smeisegeier

pandas_plots-0.12.7.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,11 @@
+pandas_plots/hlp.py,sha256=N6NrbFagVMMX-ZnV0rIBEz82SeSoOkksfMcCap55W7E,16588
+pandas_plots/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
+pandas_plots/pls.py,sha256=WpQ8hPmp8MbHvgEvSejDYFXyY_hZabLY4OLW8S6u15g,44310
+pandas_plots/tbl.py,sha256=tuTDRFaD4lKQ2fMeMCJwnJL65zXuUGVQ6uwQNVa0y6Q,31883
+pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
+pandas_plots-0.12.7.dist-info/LICENSE,sha256=6KQ5KVAAhRaB-JJKpX4cefKvRZRgI7GUPc92_2d31XY,1051
+pandas_plots-0.12.7.dist-info/METADATA,sha256=9i_TsDQinaUPz9eqJO7a0L4JFZmmM3l_WzoPbmDPH0Y,7358
+pandas_plots-0.12.7.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+pandas_plots-0.12.7.dist-info/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
+pandas_plots-0.12.7.dist-info/top_level.txt,sha256=XnaNuIHBqMmCeh_U7nKOYTwFue_SIA0wxuDgdPmnnSk,13
+pandas_plots-0.12.7.dist-info/RECORD,,

pandas_plots-0.12.6.dist-info/RECORD DELETED Viewed

@@ -1,11 +0,0 @@
-pandas_plots/hlp.py,sha256=N6NrbFagVMMX-ZnV0rIBEz82SeSoOkksfMcCap55W7E,16588
-pandas_plots/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
-pandas_plots/pls.py,sha256=DsFnWbGNmMnZ8a2qnZFwXH6VekwPFaIwZEQ9TVp6xCg,43997
-pandas_plots/tbl.py,sha256=4VvjLisPT1gSvgsLClcrhC7LIJ-_FPNla8nomGflGag,30509
-pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
-pandas_plots-0.12.6.dist-info/LICENSE,sha256=6KQ5KVAAhRaB-JJKpX4cefKvRZRgI7GUPc92_2d31XY,1051
-pandas_plots-0.12.6.dist-info/METADATA,sha256=-mCMgoWTwG6HSL8JtuYvwM1LCkzglJm3aIocaUMijO4,7358
-pandas_plots-0.12.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-pandas_plots-0.12.6.dist-info/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
-pandas_plots-0.12.6.dist-info/top_level.txt,sha256=XnaNuIHBqMmCeh_U7nKOYTwFue_SIA0wxuDgdPmnnSk,13
-pandas_plots-0.12.6.dist-info/RECORD,,

{pandas_plots-0.12.6.dist-info → pandas_plots-0.12.7.dist-info}/LICENSE RENAMED Viewed

File without changes

{pandas_plots-0.12.6.dist-info → pandas_plots-0.12.7.dist-info}/WHEEL RENAMED Viewed

File without changes

{pandas_plots-0.12.6.dist-info → pandas_plots-0.12.7.dist-info}/pii.py RENAMED Viewed

File without changes

{pandas_plots-0.12.6.dist-info → pandas_plots-0.12.7.dist-info}/top_level.txt RENAMED Viewed

File without changes

pandas-plots 0.12.6__py3-none-any.whl → 0.12.7__py3-none-any.whl

pandas-plots 0.12.6__py3-none-any.whl → 0.12.7__py3-none-any.whl