pandas-plots 0.12.5__tar.gz → 0.12.7__tar.gz

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: pandas-plots
- Version: 0.12.5
+ Version: 0.12.7
  Summary: A collection of helper for table handling and visualization
  Home-page: https://github.com/smeisegeier/pandas-plots
  Author: smeisegeier
@@ -1,6 +1,6 @@
  [metadata]
  name = pandas-plots
- version = 0.12.5
+ version = 0.12.7
  author = smeisegeier
  author_email = dexterDSDo@googlemail.com
  description = A collection of helper for table handling and visualization
@@ -0,0 +1,76 @@
+ import pandas as pd
+ import re
+
+
+ def remove_pii(
+     series: pd.Series,
+     verbose: bool = True,
+     logging: bool = False,
+     custom_regex="",
+ ) -> pd.Index:
+     """
+     Remove personally identifiable information (PII) from the given column.
+
+     Parameters:
+     - series: A pandas Series representing a column in a DataFrame.
+     - verbose: If True, print pii items
+     - logging: If True, write pii items into the file .pii.log
+     - custom_regex: Regex that is injected into detection
+
+     Returns:
+     - index object with indexes of all pii items
+
+     Remarks:
+     - df.drop(axis=0, index=result, inplace=True)
+     """
+
+     # * reject empty columns
+     assert len(series) > 0
+
+     col = series.copy()
+
+     # * na must be dropped to ensure processsing
+     col.dropna(inplace=True)
+
+     # * find terms
+     _terms = frozenset(["lösch", "herr", "frau", "strasse", "klinik"])
+     idx_terms = col[
+         col.str.contains(
+             "|".join(_terms),
+             case=False,
+             regex=True,
+         )
+     ].index
+
+     # # * optional: search for terms in whole df
+     # df.apply(lambda row: row.astype(str).str.contains('test', case=False, regex=True).any(), axis=1)
+
+     # # * find dates
+     ptr_date = r"\d{2}\.\d{2}\.\d{4}"
+     idx_date = col[col.str.contains(ptr_date, regex=True)].index
+
+     # * dr
+     ptr_dr = r"[D|d][R|r]\. | Fr\. | Hr\. | PD "
+     idx_dr = col[col.str.contains(ptr_dr, regex=True)].index
+
+     # * custom
+     idx_custom = (
+         col[col.str.contains(custom_regex, regex=True)].index
+         if custom_regex
+         else pd.Index([])
+     )
+
+     idx_all = idx_terms.union(idx_date).union(idx_dr).union(idx_custom)
+
+     if verbose:
+         # print(f"found: {idx_dr.__len__()} dr | {idx_date.__len__()} date | {idx_terms.__len__()} terms")
+         print(f"found {idx_all.__len__():_} pii items:")
+         print(col.loc[idx_all].tolist())
+
+     if logging: # Assuming logging is defined and has the correct value
+         data = col.loc[idx_all] # Assuming col and idx_all are defined
+         with open(".pii.log", "w") as f:
+             # ! when using str(), it will give only a summary!
+             f.write(data.to_string(index=True))
+
+     return idx_all
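The new pii.py module centers on remove_pii, which only collects the index labels of suspicious rows; dropping them is left to the caller. A minimal usage sketch (the import path and the sample data are assumptions, not part of this diff):

    import pandas as pd
    from pandas_plots.pii import remove_pii  # import path assumed from the added file

    # hypothetical free-text column containing PII-like entries
    df = pd.DataFrame(
        {"note": ["Herr Meier, Musterstrasse 1", "unauffällig", "Dr. Kim, 01.02.1990", None]}
    )

    # returns a pd.Index of flagged rows: term hits, dd.mm.yyyy dates, "Dr."/"Fr."/"Hr." patterns
    pii_idx = remove_pii(df["note"], verbose=True, logging=False)

    # dropping is left to the caller, as the docstring remark suggests
    df = df.drop(index=pii_idx)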
@@ -12,6 +12,7 @@ from matplotlib import pyplot as plt
  from plotly import express as px
  import plotly.graph_objects as go
  from plotly.subplots import make_subplots
+ import plotly # needed for return types
 
  from .hlp import *
  from .tbl import print_summary
@@ -189,7 +190,7 @@ def plot_stacked_bars(
      png_path: Path | str = None,
      color_palette: str = "Plotly",
      null_label: str = "<NA>",
- ) -> object:
+ ) -> plotly.graph_objects:
      """
      Generates a stacked bar plot using the provided DataFrame.
 
@@ -220,7 +221,7 @@ def plot_stacked_bars(
      - A Plotly figure object representing the stacked bar chart.
      """
      BAR_LENGTH_MULTIPLIER = 1.05
-
+
      # * 2 axis means at least 2 columns
      if len(df.columns) < 2 or len(df.columns) > 3:
          print("❌ df must have exactly 2 or 3 columns")
@@ -256,16 +257,6 @@ def plot_stacked_bars(
      col_index = df.columns[0] if not swap else df.columns[1]
      col_color = df.columns[1] if not swap else df.columns[0]
 
-     # * assign colors to columns
-     unique_colors = sorted(df[col_color].unique())
-     column_colors = assign_column_colors(unique_colors, color_palette, null_label)
-
-     # * add total as aggregation of df
-     if show_total:
-         df_total = df.copy()
-         df_total[col_index] = " TOTAL" # add space to make this item first
-         df = pd.concat([df, df_total])
-
      # * ensure df is grouped to prevent false aggregations
      df = (
          df.groupby([df.columns[0], df.columns[1]])
@@ -273,8 +264,32 @@ def plot_stacked_bars(
          .sum()
          .reset_index()
      )
+
+     # * add total as aggregation of df
+     if show_total:
+         df_total = df.groupby(df.columns[1], observed=True, as_index=False)[df.columns[2]].sum()
+         df_total[df.columns[0]] = " Total"
+         df = pd.concat([df, df_total], ignore_index=True)
+
+
+     # * apply top_n, reduce df
+     n_col = top_n_color if top_n_color > 0 else None
+     n_idx = top_n_index if top_n_index > 0 else None
+
+     unique_colors = sorted(
+         df.groupby(col_color)[df.columns[2]]
+         .sum()
+         .sort_values(ascending=False)
+         .index.tolist()[:n_col]
+     )
 
-     # * Sorting logic based on sort_values
+     unique_idx = df[col_index].sort_values().unique()[:n_idx]
+
+     df = df[df[col_color].isin(unique_colors)]#.sort_values(by=[col_index, col_color])
+     df = df[df[col_index].isin(unique_idx)]#.sort_values(by=[col_index, col_color])
+
+
+     # # * Sorting logic based on sort_values
      if sort_values:
          sort_order = (
              df.groupby(col_index)[df.columns[2]].sum().sort_values(ascending=False).index
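In 0.12.7 the optional total row is built after grouping, as a per-color aggregation, and top_n_index / top_n_color now shrink the frame before plotting. A standalone sketch of that reduction on a toy frame (column names and values are invented for illustration):

    import pandas as pd

    # toy long-format input: index column, color column, value column
    df = pd.DataFrame(
        {"region": ["a", "a", "b", "b", "c"],
         "product": ["x", "y", "x", "y", "x"],
         "cnt": [5, 3, 2, 8, 1]}
    )
    df = df.groupby(["region", "product"], as_index=False)["cnt"].sum()

    # total row per color category; the leading space in " Total" sorts it first
    df_total = df.groupby("product", as_index=False)["cnt"].sum()
    df_total["region"] = " Total"
    df = pd.concat([df, df_total], ignore_index=True)

    # keep only the top-n color categories by summed value and the first n index values
    top_n_color, top_n_index = 1, 2
    n_col = top_n_color if top_n_color > 0 else None
    n_idx = top_n_index if top_n_index > 0 else None
    unique_colors = sorted(df.groupby("product")["cnt"].sum().sort_values(ascending=False).index[:n_col])
    unique_idx = df["region"].sort_values().unique()[:n_idx]
    df = df[df["product"].isin(unique_colors) & df["region"].isin(unique_idx)]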
@@ -282,9 +297,14 @@ def plot_stacked_bars(
      else:
          sort_order = sorted(df[col_index].unique()) # Alphabetical order
 
-     # * Convert to categorical with explicit ordering
+     # # * Convert to categorical with explicit ordering
      df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
 
+     column_colors = assign_column_colors(
+         columns=unique_colors,
+         color_palette=color_palette,
+         null_label=null_label
+     )
 
      # * calculate n
      divider = 2 if show_total else 1
@@ -297,13 +317,24 @@ def plot_stacked_bars(
      _title_str_n = f", n={n:_}"
      caption = _set_caption(caption)
 
+     # * after grouping add cols for pct and formatting
+     df["pct"] = df[df.columns[2]].apply(lambda x: f"{(x / n) * 100:.{precision}f}%")
+
+     # * format output
+     df["cnt_str"] = df[df.columns[2]].apply(lambda x: f"{x:_.{precision}f}")
+
+     divider2 = "<br>" if orientation == "v" else " "
+     df["cnt_pct_str"] = df.apply(
+         lambda row: f"{row['cnt_str']}{divider2}({row['pct']})", axis=1
+     )
+
      # * plot
      fig = px.bar(
          df,
          x=col_index if orientation == "v" else df.columns[2],
          y=df.columns[2] if orientation == "v" else col_index,
          color=col_color,
-         text=df.columns[2],
+         text="cnt_pct_str" if normalize else "cnt_str",
          orientation=orientation,
          title=title
          or f"{caption}{_title_str_top_index}[{col_index}] by {_title_str_top_color}[{col_color}]{_title_str_null}{_title_str_n}",
@@ -312,7 +343,9 @@ def plot_stacked_bars(
          height=height,
          color_discrete_map=column_colors, # Use assigned colors
          category_orders={col_index: list(df[col_index].cat.categories)}, # <- Add this line
+
      )
+
      # * get longest bar
      bar_max = (
          df.groupby(col_index)[df.columns[2]].sum().sort_values(ascending=False).iloc[0]
@@ -344,13 +377,14 @@ def plot_stacked_bars(
      if orientation == "h":
          if relative:
              fig.update_xaxes(dtick=5)
-         elif normalize:
-             fig.update_xaxes(dtick=0.05)
+         # bug dticks are ultra dense
+         # elif normalize:
+         # fig.update_xaxes(dtick=0.05)
      else:
          if relative:
              fig.update_yaxes(dtick=5)
-         elif normalize:
-             fig.update_yaxes(dtick=0.05)
+         # elif normalize:
+         # fig.update_yaxes(dtick=0.05)
 
      # * show grids, set to smaller distance on pct scale
      fig.update_xaxes(showgrid=True, gridwidth=1)
@@ -474,7 +508,10 @@ def plot_bars(
 
      # * after grouping add cols for pct and formatting
      df["pct"] = df[df.columns[1]] / n
+
+     # * format output
      df["cnt_str"] = df[df.columns[1]].apply(lambda x: f"{x:_.{precision}f}")
+
      divider = "<br>" if orientation == "v" else " "
      df["cnt_pct_str"] = df.apply(
          lambda row: f"{row['cnt_str']}{divider}({row['pct']:.1%})", axis=1
@@ -961,7 +998,8 @@ def plot_box(
          fig.show("png")
 
      if summary:
-         print_summary(ser)
+         # * if only series is provided, col name is None
+         print_summary(ser.to_frame())
 
      # * save to png if path is provided
      if png_path is not None:
@@ -976,7 +1014,7 @@ def plot_boxes(
      points: Literal["all", "outliers", "suspectedoutliers", None] = None,
      precision: int = 2,
      height: int = 600,
-     width: int = 800,
+     width: int = 1200,
      annotations: bool = True,
      summary: bool = True,
      title: str = None,
@@ -1003,7 +1041,7 @@
      if (
          len(df.columns) != 2
          or not (
-             (pd.api.types.is_string_dtype(df.iloc[:, 0]))
+             (pd.api.types.is_object_dtype(df.iloc[:, 0]))
              or (pd.api.types.is_bool_dtype(df.iloc[:, 0]))
          )
          or not pd.api.types.is_numeric_dtype(df.iloc[:, 1])
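The plot_boxes input check now uses is_object_dtype instead of is_string_dtype for the grouping column. A quick illustration of what each check accepts, using plain pandas (not package code; results as of recent pandas versions):

    import pandas as pd

    obj_col = pd.Series(["a", "b"], dtype="object")
    str_col = pd.Series(["a", "b"], dtype="string")

    pd.api.types.is_object_dtype(obj_col)  # True
    pd.api.types.is_object_dtype(str_col)  # False - the new check rejects pandas "string" columns
    pd.api.types.is_string_dtype(obj_col)  # True
    pd.api.types.is_string_dtype(str_col)  # True  - the old check accepted both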
@@ -1109,7 +1147,7 @@
 
      fig.show("png")
      if summary:
-         print_summary(df)
+         print_summary(df=df, precision=precision)
 
      # * save to png if path is provided
      if png_path is not None:
@@ -1175,8 +1213,23 @@ def plot_facet_stacked_bars(
 
      aggregated_df = aggregate_data(df, top_n_index, top_n_columns, top_n_facet, null_label)
 
-     facets = aggregated_df['facet'].unique()
-     columns = sorted(aggregated_df['col'].unique())
+     # facets = aggregated_df['facet'].unique()
+     facets = sorted(aggregated_df['facet'].unique()) # Ensure facets are sorted consistently
+
+     if top_n_columns > 0:
+         top_columns = aggregated_df.groupby('col', observed=True)['value'].sum().nlargest(top_n_columns).index.tolist()
+         # aggregated_df['col'] = aggregated_df['col'].apply(lambda x: x if x in top_columns else "<other>")
+         # aggregated_df['col'] = pd.Categorical(aggregated_df['col'], categories=top_columns + ["<other>"], ordered=True)
+         # aggregated_df['col'] = pd.Categorical(
+         # aggregated_df['col'].map(lambda x: x if x in top_columns else "<other>"),
+         # categories=top_columns + ["<other>"],
+         # ordered=True
+         # )
+         aggregated_df['col'] = aggregated_df['col'].apply(lambda x: x if x in top_columns else "<other>")
+
+
+     # columns = sorted(aggregated_df['col'].unique())
+     columns = aggregated_df.groupby('col', observed=True)['value'].sum().sort_values(ascending=False).index.tolist()
      column_colors = assign_column_colors(columns, color_palette, null_label)
 
      fig = make_subplots(
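For plot_facet_stacked_bars, top_n_columns now keeps the largest column categories by summed value and folds everything else into "<other>" before colors are assigned, and the color order follows the summed values. A standalone sketch of that bucketing (toy data, names as in the hunk above):

    import pandas as pd

    aggregated_df = pd.DataFrame({"col": ["a", "b", "c", "d"], "value": [10, 7, 2, 1]})

    top_n_columns = 2
    top_columns = (
        aggregated_df.groupby("col", observed=True)["value"].sum()
        .nlargest(top_n_columns).index.tolist()
    )  # ['a', 'b']
    aggregated_df["col"] = aggregated_df["col"].apply(lambda x: x if x in top_columns else "<other>")

    # columns ordered by summed value, largest first, for stable color assignment
    columns = (
        aggregated_df.groupby("col", observed=True)["value"].sum()
        .sort_values(ascending=False).index.tolist()
    )  # ['a', 'b', '<other>']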
@@ -15,7 +15,7 @@ from plotly.subplots import make_subplots
  from scipy import stats
  import dataframe_image as dfi
 
- from .hlp import wrap_text
+ from .hlp import wrap_text, to_series
 
  import duckdb as ddb
 
@@ -696,7 +696,7 @@ def show_num_df(
 
 
 
- def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="🟠 "):
+ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str=" ", precision: int=3):
      """
      Print statistical summary for a pandas DataFrame or Series.
 
@@ -712,15 +712,44 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
          df (Union[pd.DataFrame, pd.Series]): Input DataFrame or Series. Only numeric columns
              in DataFrame are considered.
          show (bool, optional): Whether to print the summary. Defaults to True.
-         name (str, optional): Prefix for the summary. Defaults to "🟠 "
+         name (str, optional): Prefix for the summary. Defaults to " ".
+         precision (int, optional): Number of digits to round the results to. Defaults to 3.
      """
      if df.empty:
          return
 
      # * drop NA to keep scipy sane
-     df = df.dropna().copy()
+     df = df.dropna().copy()
 
-     def print_summary_ser(ser: pd.Series, show: bool=True, name: str=""):
+     # display(df)
+
+     if len(df.columns) == 1:
+         df = df.to_series()
+
+     pd.api.types.is_numeric_dtype(df)
+
+
+     if not (
+         # * series must be numeric
+         (isinstance(df, pd.Series)
+          and pd.api.types.is_numeric_dtype(df)
+         )
+         or
+         # * df must have 2 columns str num
+         (len(df.columns) == 2
+          and (
+              (pd.api.types.is_object_dtype(df.iloc[:, 0]))
+              or (pd.api.types.is_bool_dtype(df.iloc[:, 0]))
+          )
+          and pd.api.types.is_numeric_dtype(df.iloc[:, 1])
+         )
+     ):
+         print(f"❌ df must have 2 columns: [0] str or bool, [1] num, or be a series")
+         return
+
+
+
+     def print_summary_ser(ser: pd.Series, show: bool=True, name: str="", precision: int=3):
          # Calculate IQR and pass `rng=(25, 75)` to get the interquartile range
          iqr_value = stats.iqr(ser)
 
@@ -728,21 +757,21 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
          # ser.dropna(inplace=True)
 
          # Using the iqr function, we still calculate the bounds manually
-         q1 = stats.scoreatpercentile(ser, 25)
-         q3 = stats.scoreatpercentile(ser, 75)
+         q1 = round(stats.scoreatpercentile(ser, 25), precision)
+         q3 = round(stats.scoreatpercentile(ser, 75), precision)
 
          # Calculate upper bound directly
-         min = round(ser.min(),3)
-         med = round(ser.median(),3)
-         upper = round(q3 + 1.5 * iqr_value,3)
-         lower = round(q1 - 1.5 * iqr_value,3)
-         mean = round(ser.mean(),3)
-         std = round(ser.std(),3)
-         cv = round(ser.std() / ser.mean(),3)
-         max = round(ser.max(),3)
-         sum = round(ser.sum(),3)
-         skew = round(stats.skew(ser.dropna().tolist()),3)
-         kurto = round(stats.kurtosis(ser.dropna().tolist()),3)
+         min = round(ser.min(), precision)
+         med = round(ser.median(), precision)
+         upper = round(q3 + 1.5 * iqr_value, precision)
+         lower = round(q1 - 1.5 * iqr_value, precision)
+         mean = round(ser.mean(), precision)
+         std = round(ser.std(), precision)
+         cv = round(ser.std() / ser.mean(), precision)
+         max = round(ser.max(), precision)
+         sum = round(ser.sum(), precision)
+         skew = round(stats.skew(ser.dropna().tolist()), precision)
+         kurto = round(stats.kurtosis(ser.dropna().tolist()), precision)
 
          lower = min if lower < min else lower
          upper = max if upper > max else upper
@@ -750,7 +779,7 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
          # * extra care for scipy metrics, these are very vulnarable to nan
          if show:
              print(
-                 f"""{name} min: {min:_} | lower: {lower:_} | q25: {q1:_} | median: {med:_} | mean: {mean:_} | q75: {q3:_} | upper: {upper:_} | max: {max:_} | std: {std:_} | cv: {cv:_} | sum: {sum:_} | skew: {skew} | kurto: {kurto}""")
+                 f"""{name} -> min: {min:_} | lower: {lower:_} | q25: {q1:_} | median: {med:_} | mean: {mean:_} | q75: {q3:_} | upper: {upper:_} | max: {max:_} | std: {std:_} | cv: {cv:_} | sum: {sum:_} | skew: {skew} | kurto: {kurto}""")
 
          summary = {
              "min": min,
@@ -770,11 +799,22 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
          return summary
 
      if isinstance(df, pd.Series):
-         return print_summary_ser(df, show=show, name=name)
+         # * print serie
+         name = df.name if df.name else "series"
+         print_summary_ser(ser=df, show=show, name=name, precision=precision)
+         return
 
      if isinstance(df, pd.DataFrame):
-         # * only show numerics
-         for col in df.select_dtypes("number").columns:
-             summary = print_summary_ser(ser=df[col],show=show, name=col)
+         # * print for all values
+         print(f"🟧 all data")
+         name = df.columns[-1]
+         summary = print_summary_ser(ser=df.iloc[:,1], show=show, name=name, precision=precision)
+
+         print(f"🟧 boxes")
+         # * print for each value
+         for item in df.iloc[:,0].unique():
+             # display(df[df.iloc[:,0] == item])
+             print_summary_ser(ser=df[df.iloc[:,0] == item].iloc[:,1], show=show, name=item, precision=precision)
 
      return summary
+
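print_summary is now geared toward the two-column frames that plot_boxes passes in: a grouping column (object or bool) plus a numeric column. It prints one summary line over all data and one per group, rounded to precision digits. A hedged usage sketch (sample data invented; the module path follows the "from .tbl import print_summary" import shown earlier):

    import pandas as pd
    from pandas_plots.tbl import print_summary

    df = pd.DataFrame(
        {"group": ["a", "a", "b", "b", "b"],
         "value": [1.0, 2.5, 3.3, 4.1, 10.0]}
    )

    # prints "🟧 all data" with one summary line for the numeric column,
    # then "🟧 boxes" with one line per unique value of the grouping column
    print_summary(df=df, precision=2)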
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: pandas-plots
- Version: 0.12.5
+ Version: 0.12.7
  Summary: A collection of helper for table handling and visualization
  Home-page: https://github.com/smeisegeier/pandas-plots
  Author: smeisegeier
@@ -10,5 +10,6 @@ src/pandas_plots/ven.py
  src/pandas_plots.egg-info/PKG-INFO
  src/pandas_plots.egg-info/SOURCES.txt
  src/pandas_plots.egg-info/dependency_links.txt
+ src/pandas_plots.egg-info/pii.py
  src/pandas_plots.egg-info/requires.txt
  src/pandas_plots.egg-info/top_level.txt