pandas-plots 0.12.6__py3-none-any.whl → 0.12.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pandas_plots/pls.py CHANGED
@@ -1,7 +1,4 @@
  from pathlib import Path
- import warnings
-
- warnings.filterwarnings("ignore")

  import os
  from typing import Optional, Literal
@@ -12,50 +9,118 @@ from matplotlib import pyplot as plt
  from plotly import express as px
  import plotly.graph_objects as go
  from plotly.subplots import make_subplots
+ import plotly # needed for return types

  from .hlp import *
  from .tbl import print_summary

  ### helper functions

+
  def _set_caption(caption: str) -> str:
  return f"#️⃣{'-'.join(caption.split())}, " if caption else ""


- def aggregate_data(df: pd.DataFrame, top_n_index: int, top_n_columns: int, top_n_facet: int, null_label: str) -> pd.DataFrame:
+ def aggregate_data(
+ df: pd.DataFrame,
+ top_n_index: int,
+ top_n_color: int,
+ top_n_facet: int,
+ null_label: str,
+ show_other: bool = False,
+ sort_values_index: bool = False,
+ sort_values_color: bool = False,
+ sort_values_facet: bool = False,
+ ) -> pd.DataFrame:
  """
  Aggregates the data, ensuring each combination of 'index', 'col', and 'facet' is unique with summed 'value'.
-
+
  Args:
  df (pd.DataFrame): Input DataFrame.
  top_n_index (int): top N values of the first column to keep. 0 means take all.
- top_n_columns (int): top N values of the second column to keep. 0 means take all.
+ top_n_color (int): top N values of the second column to keep. 0 means take all.
  top_n_facet (int): top N values of the third column to keep. 0 means take all.
  null_label (str): Label for null values.
+ show_other (bool): Whether to include "<other>" for columns not in top_n_color. Defaults to False.
+ sort_values (bool): Whether to sort values in descending order based on group sum. Defaults to False.

  Returns:
  pd.DataFrame: Aggregated and filtered dataset.
  """
- for col in ['index', 'col', 'facet']: # Skip 'value' column (numeric)
+
+ for col in ["index", "col", "facet"]: # Skip 'value' column (numeric)
  df[col] = df[col].fillna(null_label)

  # Aggregate data to ensure unique combinations
- aggregated_df = df.groupby(['index', 'col', 'facet'], as_index=False)['value'].sum()
+ aggregated_df = df.groupby(["index", "col", "facet"], as_index=False)["value"].sum()
+
+ # * Reduce data based on top_n parameters
+ if sort_values_index:
+ top_indexes = (
+ aggregated_df.groupby("index")["value"]
+ .sum()
+ .sort_values(ascending=False)[:top_n_index or None]
+ .index
+ )
+ else:
+ top_indexes = aggregated_df["index"].sort_values().unique()[:top_n_index or None]
+
+ aggregated_df = aggregated_df[aggregated_df["index"].isin(top_indexes)]
+
+ if sort_values_color:
+ top_colors = (
+ aggregated_df.groupby("col")["value"]
+ .sum()
+ .sort_values(ascending=False)[:top_n_color or None]
+ .index
+ )
+ else:
+ top_colors = aggregated_df["col"].sort_values().unique()[:top_n_color or None]
+
+ others_df = df[~df["col"].isin(top_colors)]
+ aggregated_df = aggregated_df[aggregated_df["col"].isin(top_colors)]
+ if show_other and top_n_color > 0 and not others_df.empty:
+ other_agg = others_df.groupby(["index", "facet"], as_index=False)[
+ "value"
+ ].sum()
+ other_agg["col"] = "<other>"
+ other_agg = other_agg[["index", "col", "facet", "value"]]
+ aggregated_df = pd.concat([aggregated_df, other_agg], ignore_index=True)
+ top_colors = [*top_colors, "<other>"]
+
+ if sort_values_facet:
+ top_facets = (
+ aggregated_df.groupby("facet")["value"]
+ .sum()
+ .sort_values(ascending=False)[:top_n_facet or None]
+ .index
+ )
+ else:
+ top_facets = aggregated_df["facet"].sort_values().unique()[:top_n_facet or None]
+
+ aggregated_df = aggregated_df[aggregated_df["facet"].isin(top_facets)]
+
+ # * Ensure facets are sorted alphabetically
+ aggregated_df["facet"] = pd.Categorical(
+ values=aggregated_df["facet"],
+ categories=top_facets,
+ ordered=True,
+ )
+
+ aggregated_df["index"] = pd.Categorical(
+ values=aggregated_df["index"],
+ categories=top_indexes,
+ ordered=True,
+ )
+
+ aggregated_df["col"] = pd.Categorical(
+ values=aggregated_df["col"],
+ categories=top_colors,
+ ordered=True,
+ )

- # Reduce data based on top_n parameters
- if top_n_index > 0:
- top_indexes = aggregated_df.groupby('index')['value'].sum().nlargest(top_n_index).index
- aggregated_df = aggregated_df[aggregated_df['index'].isin(top_indexes)]
- if top_n_columns > 0:
- top_columns = aggregated_df.groupby('col')['value'].sum().nlargest(top_n_columns).index
- aggregated_df = aggregated_df[aggregated_df['col'].isin(top_columns)]
- if top_n_facet > 0:
- top_facets = aggregated_df.groupby('facet')['value'].sum().nlargest(top_n_facet).index
- aggregated_df = aggregated_df[aggregated_df['facet'].isin(top_facets)]

- # Ensure facets are sorted alphabetically
- aggregated_df['facet'] = pd.Categorical(aggregated_df['facet'], sorted(aggregated_df['facet'].unique()))
- aggregated_df = aggregated_df.sort_values(by='facet')
+ # aggregated_df = aggregated_df.sort_values(by="facet")

  return aggregated_df

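For illustration only (not from the package source): a minimal sketch of how the reworked aggregate_data could be driven on its own, assuming the helper is importable from pandas_plots.pls; the long-format frame and its values are invented, and the column names follow the index/col/facet/value contract the function expects.

import pandas as pd
from pandas_plots.pls import aggregate_data  # assumption: module-level helper is importable

# toy long-format data with the expected column names (illustrative only)
df = pd.DataFrame(
    {
        "index": ["A", "A", "B", "B", None],  # None is replaced by null_label
        "col": ["x", "y", "x", "y", "x"],
        "facet": ["f1", "f1", "f2", "f2", "f2"],
        "value": [10, 5, 3, 7, 2],
    }
)

# keep the single strongest color by group sum, bucket the rest into "<other>"
out = aggregate_data(
    df,
    top_n_index=0,
    top_n_color=1,
    top_n_facet=0,
    null_label="<NA>",
    show_other=True,
    sort_values_color=True,
)
print(out)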
@@ -76,13 +141,15 @@ def assign_column_colors(columns, color_palette, null_label):
  palette = getattr(px.colors.qualitative, color_palette)
  else:
  raise ValueError(f"Invalid color palette: {color_palette}")
-
+
  colors = {col: palette[i % len(palette)] for i, col in enumerate(sorted(columns))}
  colors[null_label] = "lightgray"
  return colors

+
  ### main functions

+
  def plot_quadrants(
  df: pd.DataFrame,
  title: str = None,
@@ -162,7 +229,7 @@ def plot_quadrants(

  # * save to png if path is provided
  if png_path is not None:
- plt.savefig(Path(png_path).as_posix(), format='png')
+ plt.savefig(Path(png_path).as_posix(), format="png")

  return q1, q2, q3, q4, n
  # * plotly express is not used for the heatmap, although it does not need the derived wide format.
@@ -184,12 +251,15 @@ def plot_stacked_bars(
  renderer: Literal["png", "svg", None] = "png",
  caption: str = None,
  sort_values: bool = False,
+ sort_values_index: bool = False,
+ sort_values_color: bool = False,
  show_total: bool = False,
  precision: int = 0,
  png_path: Path | str = None,
  color_palette: str = "Plotly",
  null_label: str = "<NA>",
- ) -> object:
+ show_other: bool = False,
+ ) -> plotly.graph_objects:
  """
  Generates a stacked bar plot using the provided DataFrame.

@@ -207,7 +277,7 @@ def plot_stacked_bars(
  - title (str): Custom title for the plot.
  - renderer (Literal["png", "svg", None]): Defines the output format.
  - caption (str): Optional caption for additional context.
- - sort_values (bool):
+ - sort_values (bool):
  - If True, sorts bars by the sum of their values (descending).
  - If False, sorts bars alphabetically.
  - show_total (bool): If True, adds a row with the total sum of all categories.
@@ -215,7 +285,10 @@ def plot_stacked_bars(
  - png_path (Path | str): If specified, saves the plot as a PNG file.
  - color_palette (str): Name of the color palette to use.
  - null_label (str): Label for null values.
-
+ - show_other (bool): If True, shows the "Other" category in the legend.
+ - sort_values_index (bool): If True, sorts the index categories by group sum
+ - sort_values_color (bool): If True, sorts the columns categories by group sum
+
  Returns:
  - A Plotly figure object representing the stacked bar chart.
  """
@@ -226,9 +299,19 @@ def plot_stacked_bars(
  print("❌ df must have exactly 2 or 3 columns")
  return

- # * check if first 2 columns are str
- if list(set((df.iloc[:, [0, 1]].dtypes)))[0].kind not in ["O", "b"]:
- print("❌ first 2 columns must be str")
+ # ! do not enforce str columns anymore
+ # # * check if first 2 columns are str
+ # dtypes = set(df.iloc[:, [0, 1]].dtypes)
+ # dtypes_kind = [i.kind for i in dtypes]
+
+ # if set(dtypes_kind) - set(["O", "b"]):
+ # print("❌ first 2 columns must be str")
+ # # * overkill ^^
+ # df.iloc[:, [0, 1]] = df.iloc[:, [0, 1]].astype(str)
+
+ # * but last col must be numeric
+ if df.iloc[:, -1].dtype.kind not in ("f", "i"):
+ print("❌ last column must be numeric")
  return

  df = df.copy() # Copy the input DataFrame to avoid modifying the original
@@ -252,69 +335,103 @@ def plot_stacked_bars(
  # * apply precision
  df.iloc[:, 2] = df.iloc[:, 2].round(precision)

- # * set index + color col
+ # # * set index + color col
  col_index = df.columns[0] if not swap else df.columns[1]
  col_color = df.columns[1] if not swap else df.columns[0]

  # * ensure df is grouped to prevent false aggregations
- df = (
- df.groupby([df.columns[0], df.columns[1]])
- [df.columns[2]]
- .sum()
- .reset_index()
- )
-
- # * Sorting logic based on sort_values
- if sort_values:
- sort_order = (
- df.groupby(col_index)[df.columns[2]].sum().sort_values(ascending=False).index
- )
- else:
- sort_order = sorted(df[col_index].unique()) # Alphabetical order
- df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
+ df = df.groupby([df.columns[0], df.columns[1]])[df.columns[2]].sum().reset_index()

  # * add total as aggregation of df
  if show_total:
- df_total = df.copy()
- df_total[col_index] = " TOTAL" # add space to make this item first
- df = pd.concat([df, df_total])
-
- # * Convert to categorical with explicit ordering
- df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
-
- if top_n_index > 0 and len(sort_order) > top_n_index:
- top_categories = sort_order[:top_n_index]
- df[col_index] = df[col_index].apply(lambda x: x if x in top_categories else "<other>")
-
- unique_colors = sorted(df[col_color].unique())
- if top_n_color > 0 and len(unique_colors) > top_n_color:
- top_colors = unique_colors[:top_n_color]
- df[col_color] = df[col_color].apply(lambda x: x if x in top_colors else "<other>")
-
- column_colors = assign_column_colors(sorted(df[col_color].unique()), color_palette, null_label)
-
- # # * assign colors to columns
- # unique_colors = sorted(df[col_color].unique())
- # column_colors = assign_column_colors(unique_colors, color_palette, null_label)
+ df_total = df.groupby(df.columns[1], observed=True, as_index=False)[
+ df.columns[2]
+ ].sum()
+ df_total[df.columns[0]] = " Total"
+ df = pd.concat([df, df_total], ignore_index=True)

  # * calculate n
  divider = 2 if show_total else 1
- n = int(df[df.columns[2]].sum() / divider)
+ n = int(df.iloc[:, 2].sum() / divider)

  # * title str
  _title_str_top_index = f"TOP{top_n_index} " if top_n_index > 0 else ""
  _title_str_top_color = f"TOP{top_n_color} " if top_n_color > 0 else ""
  _title_str_null = f", NULL excluded" if dropna else ""
  _title_str_n = f", n={n:_}"
+
+ _df = df.copy().assign(facet=None)
+ _df.columns = (
+ ["index", "col", "value", "facet"]
+ if not swap
+ else ["col", "index", "value", "facet"]
+ )
+
+ aggregated_df = aggregate_data(
+ df=_df,
+ top_n_index=top_n_index,
+ top_n_color=top_n_color,
+ top_n_facet=0,
+ null_label=null_label,
+ show_other=show_other,
+ sort_values_index=sort_values_index,
+ sort_values_color=sort_values_color,
+ sort_values_facet=False, # just a placeholder
+ )
+
+ df = aggregated_df.copy()
+
+ columns = sorted(
+ df.groupby("col", observed=True)["value"]
+ .sum()
+ .sort_values(ascending=False)
+ .index.tolist()
+ )
+ column_colors = assign_column_colors(columns, color_palette, null_label)
+
  caption = _set_caption(caption)

+ # * after grouping add cols for pct and formatting
+ df["cnt_pct_only"] = df["value"].apply(lambda x: f"{(x / n) * 100:.{precision}f}%")
+
+ # * format output
+ df["cnt_str"] = df["value"].apply(lambda x: f"{x:_.{precision}f}")
+
+ divider2 = "<br>" if orientation == "v" else " "
+ df["cnt_pct_str"] = df.apply(
+ lambda row: f"{row['cnt_str']}{divider2}({row['cnt_pct_only']})", axis=1
+ )
+
+ # # # * Sorting logic based on sort_values
+ # if sort_values_index:
+ # sort_order = (
+ # df.groupby("index")["value"].sum().sort_values(ascending=False).index
+ # )
+ # else:
+ # sort_order = sorted(df["index"].unique(), reverse=False) # Alphabetical order
+
+ # display(sort_order)
+
+ # df["index"] = pd.Categorical(
+ # values=df["index"],
+ # # categories=sort_order,
+ # ordered=True,
+ # )
+ df = (
+ df.sort_values(by="index", ascending=False)
+ if orientation == "h"
+ else df.sort_values(by="index", ascending=True)
+ )
+
+ # display(df)
+
  # * plot
  fig = px.bar(
  df,
- x=col_index if orientation == "v" else df.columns[2],
- y=df.columns[2] if orientation == "v" else col_index,
- color=col_color,
- text=df.columns[2],
+ x="index" if orientation == "v" else "value",
+ y="value" if orientation == "v" else "index",
+ color="col",
+ text="cnt_pct_str" if normalize else "cnt_str",
  orientation=orientation,
  title=title
  or f"{caption}{_title_str_top_index}[{col_index}] by {_title_str_top_color}[{col_color}]{_title_str_null}{_title_str_n}",
@@ -322,13 +439,15 @@ def plot_stacked_bars(
  width=width,
  height=height,
  color_discrete_map=column_colors, # Use assigned colors
- category_orders={col_index: list(df[col_index].cat.categories)}, # <- Add this line
- # category_orders={col_index: df[col_index].categories.tolist() if isinstance(df[col_index].dtype, pd.CategoricalDtype) else sorted(df[col_index].unique())}
-
+ category_orders={
+ col_index: list(df["index"].cat.categories)
+ }, # <- Add this line
  )
- # * get longest bar
+
+
+ # * get longest bar
  bar_max = (
- df.groupby(col_index)[df.columns[2]].sum().sort_values(ascending=False).iloc[0]
+ df.groupby("index")["value"].sum().sort_values(ascending=False).iloc[0]
  * BAR_LENGTH_MULTIPLIER
  )
  # * ignore if bar mode is on
@@ -352,7 +471,7 @@ def plot_stacked_bars(
  },
  },
  )
-
+
  # * set dtick
  if orientation == "h":
  if relative:
@@ -488,7 +607,10 @@ def plot_bars(

  # * after grouping add cols for pct and formatting
  df["pct"] = df[df.columns[1]] / n
+
+ # * format output
  df["cnt_str"] = df[df.columns[1]].apply(lambda x: f"{x:_.{precision}f}")
+
  divider = "<br>" if orientation == "v" else " "
  df["cnt_pct_str"] = df.apply(
  lambda row: f"{row['cnt_str']}{divider}({row['pct']:.1%})", axis=1
@@ -669,7 +791,7 @@ def plot_histogram(
  caption (str): The caption for the plot. Default is None.
  title (str): The title of the plot. Default is None.
  png_path (Path | str, optional): The path to save the image as a png file. Defaults to None.
-
+

  Returns:
  plot object
@@ -721,7 +843,7 @@ def plot_histogram(
  )

  fig.show(renderer)
-
+
  # * save to png if path is provided
  if png_path is not None:
  fig.write_image(Path(png_path).as_posix())
@@ -991,7 +1113,7 @@ def plot_boxes(
  points: Literal["all", "outliers", "suspectedoutliers", None] = None,
  precision: int = 2,
  height: int = 600,
- width: int = 800,
+ width: int = 1200,
  annotations: bool = True,
  summary: bool = True,
  title: str = None,
@@ -1018,7 +1140,7 @@ def plot_boxes(
  if (
  len(df.columns) != 2
  or not (
- (pd.api.types.is_string_dtype(df.iloc[:, 0]))
+ (pd.api.types.is_object_dtype(df.iloc[:, 0]))
  or (pd.api.types.is_bool_dtype(df.iloc[:, 0]))
  )
  or not pd.api.types.is_numeric_dtype(df.iloc[:, 1])
@@ -1124,7 +1246,7 @@ def plot_boxes(

  fig.show("png")
  if summary:
- print_summary(df)
+ print_summary(df=df, precision=precision)

  # * save to png if path is provided
  if png_path is not None:
@@ -1133,12 +1255,11 @@ def plot_boxes(
  return fig


-
  def plot_facet_stacked_bars(
  df: pd.DataFrame,
  subplots_per_row: int = 4,
  top_n_index: int = 0,
- top_n_columns: int = 0,
+ top_n_color: int = 0,
  top_n_facet: int = 0,
  null_label: str = "<NA>",
  subplot_size: int = 300,
@@ -1148,6 +1269,12 @@ def plot_facet_stacked_bars(
  annotations: bool = False,
  precision: int = 0,
  png_path: Optional[Path] = None,
+ show_other: bool = False,
+ sort_values: bool = True,
+ sort_values_index: bool = False,
+ sort_values_color: bool = False,
+ sort_values_facet: bool = False,
+
  ) -> object:
  """
  Create a grid of stacked bar charts.
@@ -1156,7 +1283,7 @@ def plot_facet_stacked_bars(
  df (pd.DataFrame): DataFrame with 3 or 4 columns.
  subplots_per_row (int): Number of subplots per row.
  top_n_index (int): top N index values to keep.
- top_n_columns (int): top N column values to keep.
+ top_n_color (int): top N column values to keep.
  top_n_facet (int): top N facet values to keep.
  null_label (str): Label for null values.
  subplot_size (int): Size of each subplot.
@@ -1166,47 +1293,57 @@ def plot_facet_stacked_bars(
  annotations (bool): Whether to show annotations in the subplots.
  precision (int): Decimal precision for annotations.
  png_path (Optional[Path]): Path to save the image.
+ show_other (bool): If True, adds an "<other>" bar for columns not in top_n_color.
+ sort_values_index (bool): If True, sorts index by group sum.
+ sort_values_color (bool): If True, sorts columns by group sum.
+ sort_values_facet (bool): If True, sorts facet by group sum.
+ sort_values (bool): DEPRECATED
+

  Returns:
  plot object
-
+
  Remarks:
  If you need to include facets that have no data, fill up like this beforehand:
  df.loc[len(df)]=[None, None, 12]
  """
-
+
  df = df.copy() # Copy the input DataFrame to avoid modifying the original

  if not (df.shape[1] == 3 or df.shape[1] == 4):
  raise ValueError("Input DataFrame must have 3 or 4 columns.")
-
+
  original_column_names = df.columns.tolist()
+ original_rows = len(df)

  if df.shape[1] == 3:
- df.columns = ['index', 'col', 'facet']
- df['value'] = 1
+ df.columns = ["index", "col", "facet"]
+ df["value"] = 1
  elif df.shape[1] == 4:
- df.columns = ['index', 'col', 'facet', 'value']
-
- aggregated_df = aggregate_data(df, top_n_index, top_n_columns, top_n_facet, null_label)
-
- # facets = aggregated_df['facet'].unique()
- facets = sorted(aggregated_df['facet'].unique()) # Ensure facets are sorted consistently
+ df.columns = ["index", "col", "facet", "value"]

- if top_n_columns > 0:
- top_columns = aggregated_df.groupby('col', observed=True)['value'].sum().nlargest(top_n_columns).index.tolist()
- # aggregated_df['col'] = aggregated_df['col'].apply(lambda x: x if x in top_columns else "<other>")
- # aggregated_df['col'] = pd.Categorical(aggregated_df['col'], categories=top_columns + ["<other>"], ordered=True)
- # aggregated_df['col'] = pd.Categorical(
- # aggregated_df['col'].map(lambda x: x if x in top_columns else "<other>"),
- # categories=top_columns + ["<other>"],
- # ordered=True
- # )
- aggregated_df['col'] = aggregated_df['col'].apply(lambda x: x if x in top_columns else "<other>")
+ aggregated_df = aggregate_data(
+ df,
+ top_n_index,
+ top_n_color,
+ top_n_facet,
+ null_label,
+ show_other=show_other,
+ sort_values_index=sort_values_index,
+ sort_values_color=sort_values_color,
+ sort_values_facet=sort_values_facet,
+ )

+ facets = sorted(
+ aggregated_df["facet"].unique()
+ ) # Ensure facets are sorted consistently

- # columns = sorted(aggregated_df['col'].unique())
- columns = aggregated_df.groupby('col', observed=True)['value'].sum().sort_values(ascending=False).index.tolist()
+ columns = sorted(
+ aggregated_df.groupby("col", observed=True)["value"]
+ .sum()
+ .sort_values(ascending=False)
+ .index.tolist()
+ )
  column_colors = assign_column_colors(columns, color_palette, null_label)

  fig = make_subplots(
@@ -1215,25 +1352,39 @@ def plot_facet_stacked_bars(
  subplot_titles=facets,
  )

+ # * Ensure all categories appear in the legend by adding an invisible trace
+ for column in columns:
+ fig.add_trace(
+ go.Bar(
+ x=[None], # Invisible bar
+ y=[None],
+ name=column,
+ marker=dict(color=column_colors[column]),
+ showlegend=True, # Ensure it appears in the legend
+ )
+ )
+
  added_to_legend = set()
  for i, facet in enumerate(facets):
- facet_data = aggregated_df[aggregated_df['facet'] == facet]
+ facet_data = aggregated_df[aggregated_df["facet"] == facet]
  row = (i // subplots_per_row) + 1
  col = (i % subplots_per_row) + 1

  for column in columns:
- column_data = facet_data[facet_data['col'] == column]
+ column_data = facet_data[facet_data["col"] == column]
+
  show_legend = column not in added_to_legend
  if show_legend:
  added_to_legend.add(column)

  fig.add_trace(
  go.Bar(
- x=column_data['index'],
- y=column_data['value'],
+ x=column_data["index"],
+ y=column_data["value"],
  name=column,
  marker=dict(color=column_colors[column]),
- showlegend=show_legend,
+ legendgroup=column, # Ensures multiple traces use the same legend entry
+ showlegend=False, # suppress further legend items
  ),
  row=row,
  col=col,
@@ -1242,8 +1393,8 @@ def plot_facet_stacked_bars(
  if annotations:
  for _, row_data in column_data.iterrows():
  fig.add_annotation(
- x=row_data['index'],
- y=row_data['value'],
+ x=row_data["index"],
+ y=row_data["value"],
  text=f"{row_data['value']:.{precision}f}",
  showarrow=False,
  row=row,
@@ -1257,8 +1408,8 @@ def plot_facet_stacked_bars(

  else:
  axis_details.append(f"[{original_column_names[0]}]")
- if top_n_columns > 0:
- axis_details.append(f"TOP {top_n_columns} [{original_column_names[1]}]")
+ if top_n_color > 0:
+ axis_details.append(f"TOP {top_n_color} [{original_column_names[1]}]")
  else:
  axis_details.append(f"[{original_column_names[1]}]")

@@ -1267,7 +1418,7 @@ def plot_facet_stacked_bars(
  else:
  axis_details.append(f"[{original_column_names[2]}]")

- title = f"{caption} {', '.join(axis_details)}, n = {unique_rows:_}"
+ title = f"{caption} {', '.join(axis_details)}, n = {original_rows:_}"
  template = "plotly_dark" if os.getenv("THEME") == "dark" else "plotly"
  fig.update_layout(
  title=title,
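For illustration only (not from the package source): a hedged sketch of calling the reworked plot_facet_stacked_bars with the renamed top_n_color and the new sort/show_other switches; the three-column frame is invented, and each row counts as 1 because the function assigns value = 1 to 3-column input.

import pandas as pd
from pandas_plots.pls import plot_facet_stacked_bars

# three categorical columns -> index, col, facet; a value of 1 is added per row
df = pd.DataFrame(
    {
        "department": ["sales", "sales", "hr", "hr", "it"],
        "status": ["open", "closed", "open", "open", "closed"],
        "year": ["2023", "2023", "2024", "2024", "2024"],
    }
)

fig = plot_facet_stacked_bars(
    df,
    subplots_per_row=2,
    top_n_color=1,          # renamed from top_n_columns in this release
    show_other=True,        # bucket remaining status values into "<other>"
    sort_values_color=True,
    annotations=True,
)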
pandas_plots/tbl.py CHANGED
@@ -15,7 +15,7 @@ from plotly.subplots import make_subplots
  from scipy import stats
  import dataframe_image as dfi

- from .hlp import wrap_text
+ from .hlp import wrap_text, to_series

  import duckdb as ddb

@@ -696,7 +696,7 @@ def show_num_df(



- def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="🟠 "):
+ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str=" ", precision: int=3):
  """
  Print statistical summary for a pandas DataFrame or Series.

@@ -712,15 +712,44 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
  df (Union[pd.DataFrame, pd.Series]): Input DataFrame or Series. Only numeric columns
  in DataFrame are considered.
  show (bool, optional): Whether to print the summary. Defaults to True.
- name (str, optional): Prefix for the summary. Defaults to "🟠 "
+ name (str, optional): Prefix for the summary. Defaults to " ".
+ precision (int, optional): Number of digits to round the results to. Defaults to 3.
  """
  if df.empty:
  return

  # * drop NA to keep scipy sane
- df = df.dropna().copy()
+ df = df.dropna().copy()

- def print_summary_ser(ser: pd.Series, show: bool=True, name: str=""):
+ # display(df)
+
+ if len(df.columns) == 1:
+ df = df.to_series()
+
+ pd.api.types.is_numeric_dtype(df)
+
+
+ if not (
+ # * series must be numeric
+ (isinstance(df, pd.Series)
+ and pd.api.types.is_numeric_dtype(df)
+ )
+ or
+ # * df must have 2 columns str num
+ (len(df.columns) == 2
+ and (
+ (pd.api.types.is_object_dtype(df.iloc[:, 0]))
+ or (pd.api.types.is_bool_dtype(df.iloc[:, 0]))
+ )
+ and pd.api.types.is_numeric_dtype(df.iloc[:, 1])
+ )
+ ):
+ print(f"❌ df must have 2 columns: [0] str or bool, [1] num, or be a series")
+ return
+
+
+
+ def print_summary_ser(ser: pd.Series, show: bool=True, name: str="", precision: int=3):
  # Calculate IQR and pass `rng=(25, 75)` to get the interquartile range
  iqr_value = stats.iqr(ser)

@@ -728,21 +757,21 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
  # ser.dropna(inplace=True)

  # Using the iqr function, we still calculate the bounds manually
- q1 = stats.scoreatpercentile(ser, 25)
- q3 = stats.scoreatpercentile(ser, 75)
+ q1 = round(stats.scoreatpercentile(ser, 25), precision)
+ q3 = round(stats.scoreatpercentile(ser, 75), precision)

  # Calculate upper bound directly
- min = round(ser.min(),3)
- med = round(ser.median(),3)
- upper = round(q3 + 1.5 * iqr_value,3)
- lower = round(q1 - 1.5 * iqr_value,3)
- mean = round(ser.mean(),3)
- std = round(ser.std(),3)
- cv = round(ser.std() / ser.mean(),3)
- max = round(ser.max(),3)
- sum = round(ser.sum(),3)
- skew = round(stats.skew(ser.dropna().tolist()),3)
- kurto = round(stats.kurtosis(ser.dropna().tolist()),3)
+ min = round(ser.min(), precision)
+ med = round(ser.median(), precision)
+ upper = round(q3 + 1.5 * iqr_value, precision)
+ lower = round(q1 - 1.5 * iqr_value, precision)
+ mean = round(ser.mean(), precision)
+ std = round(ser.std(), precision)
+ cv = round(ser.std() / ser.mean(), precision)
+ max = round(ser.max(), precision)
+ sum = round(ser.sum(), precision)
+ skew = round(stats.skew(ser.dropna().tolist()), precision)
+ kurto = round(stats.kurtosis(ser.dropna().tolist()), precision)

  lower = min if lower < min else lower
  upper = max if upper > max else upper
@@ -750,7 +779,7 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
  # * extra care for scipy metrics, these are very vulnarable to nan
  if show:
  print(
- f"""{name} min: {min:_} | lower: {lower:_} | q25: {q1:_} | median: {med:_} | mean: {mean:_} | q75: {q3:_} | upper: {upper:_} | max: {max:_} | std: {std:_} | cv: {cv:_} | sum: {sum:_} | skew: {skew} | kurto: {kurto}""")
+ f"""{name} -> min: {min:_} | lower: {lower:_} | q25: {q1:_} | median: {med:_} | mean: {mean:_} | q75: {q3:_} | upper: {upper:_} | max: {max:_} | std: {std:_} | cv: {cv:_} | sum: {sum:_} | skew: {skew} | kurto: {kurto}""")

  summary = {
  "min": min,
@@ -770,11 +799,22 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
  return summary

  if isinstance(df, pd.Series):
- return print_summary_ser(df, show=show, name=name)
+ # * print serie
+ name = df.name if df.name else "series"
+ print_summary_ser(ser=df, show=show, name=name, precision=precision)
+ return

  if isinstance(df, pd.DataFrame):
- # * only show numerics
- for col in df.select_dtypes("number").columns:
- summary = print_summary_ser(ser=df[col],show=show, name=col)
+ # * print for all values
+ print(f"🟧 all data")
+ name = df.columns[-1]
+ summary = print_summary_ser(ser=df.iloc[:,1], show=show, name=name, precision=precision)
+
+ print(f"🟧 boxes")
+ # * print for each value
+ for item in df.iloc[:,0].unique():
+ # display(df[df.iloc[:,0] == item])
+ print_summary_ser(ser=df[df.iloc[:,0] == item].iloc[:,1], show=show, name=item, precision=precision)

  return summary
+
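For illustration only (not from the package source): a small, hedged sketch of the extended print_summary contract, mirroring how plot_boxes now calls it; the two-column frame (label column plus numeric column) is invented.

import pandas as pd
from pandas_plots.tbl import print_summary

# [0] group label (str or bool), [1] numeric value
df = pd.DataFrame(
    {
        "group": ["a", "a", "b", "b", "b"],
        "value": [1.0, 2.0, 3.0, 4.0, 5.5],
    }
)

# one summary line for all data, then one line per group, rounded to 1 digit
print_summary(df=df, show=True, precision=1)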
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: pandas-plots
- Version: 0.12.6
+ Version: 0.12.8
  Summary: A collection of helper for table handling and visualization
  Home-page: https://github.com/smeisegeier/pandas-plots
  Author: smeisegeier
@@ -0,0 +1,11 @@
+ pandas_plots/hlp.py,sha256=N6NrbFagVMMX-ZnV0rIBEz82SeSoOkksfMcCap55W7E,16588
+ pandas_plots/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
+ pandas_plots/pls.py,sha256=8wqdoE8hXr1nwQH1Q4KelAso49txb-CyVwVKoqt_xeY,47422
+ pandas_plots/tbl.py,sha256=tuTDRFaD4lKQ2fMeMCJwnJL65zXuUGVQ6uwQNVa0y6Q,31883
+ pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
+ pandas_plots-0.12.8.dist-info/LICENSE,sha256=6KQ5KVAAhRaB-JJKpX4cefKvRZRgI7GUPc92_2d31XY,1051
+ pandas_plots-0.12.8.dist-info/METADATA,sha256=ARFgg-_KdNUg5q0qa4Zhh7o12Bh2VsTWeh-45hHO0D0,7358
+ pandas_plots-0.12.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ pandas_plots-0.12.8.dist-info/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
+ pandas_plots-0.12.8.dist-info/top_level.txt,sha256=XnaNuIHBqMmCeh_U7nKOYTwFue_SIA0wxuDgdPmnnSk,13
+ pandas_plots-0.12.8.dist-info/RECORD,,
@@ -1,11 +0,0 @@
- pandas_plots/hlp.py,sha256=N6NrbFagVMMX-ZnV0rIBEz82SeSoOkksfMcCap55W7E,16588
- pandas_plots/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
- pandas_plots/pls.py,sha256=DsFnWbGNmMnZ8a2qnZFwXH6VekwPFaIwZEQ9TVp6xCg,43997
- pandas_plots/tbl.py,sha256=4VvjLisPT1gSvgsLClcrhC7LIJ-_FPNla8nomGflGag,30509
- pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
- pandas_plots-0.12.6.dist-info/LICENSE,sha256=6KQ5KVAAhRaB-JJKpX4cefKvRZRgI7GUPc92_2d31XY,1051
- pandas_plots-0.12.6.dist-info/METADATA,sha256=-mCMgoWTwG6HSL8JtuYvwM1LCkzglJm3aIocaUMijO4,7358
- pandas_plots-0.12.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- pandas_plots-0.12.6.dist-info/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
- pandas_plots-0.12.6.dist-info/top_level.txt,sha256=XnaNuIHBqMmCeh_U7nKOYTwFue_SIA0wxuDgdPmnnSk,13
- pandas_plots-0.12.6.dist-info/RECORD,,