pandas-plots 0.11.28__tar.gz → 0.12.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: pandas-plots
3
- Version: 0.11.28
3
+ Version: 0.12.1
4
4
  Summary: A collection of helper for table handling and visualization
5
5
  Home-page: https://github.com/smeisegeier/pandas-plots
6
6
  Author: smeisegeier
@@ -85,6 +85,7 @@ tbl.show_num_df(
85
85
  - `descr_db()` a very short descr for a `duckdb` relation
86
86
  - `pivot_df()` gets a pivot table of a 3 column dataframe (or 2 columns if no weights are given)
87
87
  - `print_summary()` shows statistics for a pandas DataFrame or Series
88
+ <br>
88
89
 
89
90
  - `pls` for plotly visualizations
90
91
  - `plot_box()` auto annotated boxplot w/ violin option
@@ -95,10 +96,13 @@ tbl.show_num_df(
95
96
  - `plot_histogram()` histogram for one or more **numerical** columns
96
97
  - `plot_joints()` a joint plot for **exactly two numerical** columns
97
98
  - `plot_quadrants()` quickly shows a 2x2 heatmap
99
+ - 🆕 `plot_facet_stacked_bars()` shows stacked bars for a facet value as subplots
100
+ <br>
98
101
 
99
102
  - `ven` offers functions for _venn diagrams_
100
103
  - `show_venn2()` displays a venn diagram for 2 sets
101
104
  - `show_venn3()` displays a venn diagram for 3 sets
105
+ <br>
102
106
 
103
107
  - `hlp` contains some (variety) helper functions
104
108
  - `to_series()` converts a dataframe to a series (`🚨 breaking change`)
@@ -110,6 +114,7 @@ tbl.show_num_df(
110
114
  - `show_package_version` prints version of a list of packages
111
115
  - `get_os` helps to identify and ensure operating system at runtime
112
116
  - `🆕 add_bitmask_label()` adds a column to the data that resolves a bitmask column into human-readable labels
117
+ <br>
113
118
 
114
119
  - `pii` has routines for handling of personally identifiable information
115
120
  - `remove_pii()` logs and deletes pii from a series
@@ -50,6 +50,7 @@ tbl.show_num_df(
50
50
  - `descr_db()` a very short descr for a `duckdb` relation
51
51
  - `pivot_df()` gets a pivot table of a 3 column dataframe (or 2 columns if no weights are given)
52
52
  - `print_summary()` shows statistics for a pandas DataFrame or Series
53
+ <br>
53
54
 
54
55
  - `pls` for plotly visualizations
55
56
  - `plot_box()` auto annotated boxplot w/ violin option
@@ -60,10 +61,13 @@ tbl.show_num_df(
60
61
  - `plot_histogram()` histogram for one or more **numerical** columns
61
62
  - `plot_joints()` a joint plot for **exactly two numerical** columns
62
63
  - `plot_quadrants()` quickly shows a 2x2 heatmap
64
+ - 🆕 `plot_facet_stacked_bars()` shows stacked bars for a facet value as subplots
65
+ <br>
63
66
 
64
67
  - `ven` offers functions for _venn diagrams_
65
68
  - `show_venn2()` displays a venn diagram for 2 sets
66
69
  - `show_venn3()` displays a venn diagram for 3 sets
70
+ <br>
67
71
 
68
72
  - `hlp` contains some (variety) helper functions
69
73
  - `to_series()` converts a dataframe to a series (`🚨 breaking change`)
@@ -75,6 +79,7 @@ tbl.show_num_df(
75
79
  - `show_package_version` prints version of a list of packages
76
80
  - `get_os` helps to identify and ensure operating system at runtime
77
81
  - `🆕 add_bitmask_label()` adds a column to the data that resolves a bitmask column into human-readable labels
82
+ <br>
78
83
 
79
84
  - `pii` has routines for handling of personally identifiable information
80
85
  - `remove_pii()` logs and deletes pii from a series
@@ -1,6 +1,6 @@
1
1
  [metadata]
2
2
  name = pandas-plots
3
- version = 0.11.28
3
+ version = 0.12.1
4
4
  author = smeisegeier
5
5
  author_email = dexterDSDo@googlemail.com
6
6
  description = A collection of helper for table handling and visualization
@@ -4,12 +4,14 @@ import warnings
4
4
  warnings.filterwarnings("ignore")
5
5
 
6
6
  import os
7
- from typing import Literal
7
+ from typing import Optional, Literal
8
8
 
9
9
  import pandas as pd
10
10
  import seaborn as sb
11
11
  from matplotlib import pyplot as plt
12
12
  from plotly import express as px
13
+ import plotly.graph_objects as go
14
+ from plotly.subplots import make_subplots
13
15
 
14
16
  from .hlp import *
15
17
  from .tbl import print_summary
@@ -1092,52 +1094,194 @@ def plot_boxes(
1092
1094
  return fig
1093
1095
 
1094
1096
 
1095
- # def plot_ci_bars_DEPR(df: pd.DataFrame, dropna: bool = True, precision: int = 2) -> None:
1096
- # """
1097
- # Generate a bar plot with confidence intervals for a given DataFrame.
1098
-
1099
- # Args:
1100
- # df (pd.DataFrame): The DataFrame to generate the plot from.
1101
- # dropna (bool, optional): Whether to drop NaN values from the DataFrame before plotting. Defaults to False.
1102
-
1103
- # Returns:
1104
- # None
1105
- # """
1106
- # # * if a df is given, convert to series
1107
- # df = df_to_series(df) # _df_to_ser(df)
1108
- # if df is None:
1109
- # return
1110
- # # display(df[df.index.isna()])
1111
-
1112
- # # * nulls are hidden by default in plotly etc, so give them a proper category
1113
- # if dropna:
1114
- # df = df.dropna()
1115
- # else:
1116
- # df.index = df.index.fillna("<NA>")
1117
-
1118
- # if os.getenv("THEME") == "dark":
1119
- # # sb.set_theme(style="darkgrid")
1120
- # # sb.set_theme(style="darkgrid")
1121
- # plt.style.use("dark_background")
1122
- # else:
1123
- # plt.style.use("ggplot")
1124
-
1125
- # # * generate plot
1126
- # fig, ax = plt.subplots(figsize=(8, 6))
1127
-
1128
- # sb.barplot(
1129
- # data=df,
1130
- # palette="tab10",
1131
- # capsize=0.1,
1132
- # ax=ax,
1133
- # errorbar=("ci", 95),
1134
- # )
1135
- # # * add the annotation
1136
- # for bar in ax.containers:
1137
- # ax.bar_label(bar, fmt=f"%.{precision}f", label_type="center")
1138
-
1139
- # ax.set(title=f"[{df.name}] for [{df.index.name}] on 95% ci")
1140
-
1141
- # plt.show()
1142
-
1143
- # return df
1097
def aggregate_data(df: pd.DataFrame, top_n_index: int, top_n_category: int, top_n_facet: int, null_label: str) -> pd.DataFrame:
    """
    Aggregate the data so that each ('index', 'col', 'facet') combination is unique with summed 'value'.

    The input DataFrame is left untouched: null replacement happens on a new frame.

    Args:
        df (pd.DataFrame): Input DataFrame with the columns 'index', 'col', 'facet' and 'value'.
        top_n_index (int): Top N values of 'index' (ranked by summed 'value') to keep. 0 means take all.
        top_n_category (int): Top N values of 'col' (ranked by summed 'value') to keep. 0 means take all.
        top_n_facet (int): Top N values of 'facet' (ranked by summed 'value') to keep. 0 means take all.
        null_label (str): Label that replaces null values in the three key columns.

    Returns:
        pd.DataFrame: Aggregated and filtered dataset.
    """
    key_columns = ['index', 'col', 'facet']

    # Replace nulls in the key columns (the numeric 'value' column is skipped).
    # `assign` returns a new frame, so the caller's DataFrame is not modified.
    labelled = df.assign(**{key: df[key].fillna(null_label) for key in key_columns})

    # Aggregate data to ensure unique combinations
    aggregated_df = labelled.groupby(key_columns, as_index=False)['value'].sum()

    # Reduce data based on the top_n parameters. The filters run in sequence
    # (index, then col, then facet), so each ranking only sees the rows that
    # survived the previous filter.
    top_n_by_key = zip(key_columns, (top_n_index, top_n_category, top_n_facet))
    for key, top_n in top_n_by_key:
        if top_n > 0:
            keep = aggregated_df.groupby(key)['value'].sum().nlargest(top_n).index
            aggregated_df = aggregated_df[aggregated_df[key].isin(keep)]

    return aggregated_df
1130
+
1131
+
1132
def assign_column_colors(columns: pd.Series, color_palette: str, null_label: str) -> dict:
    """
    Assign colors to columns using the selected color palette and handle null columns separately.

    The palette name is looked up among plotly's qualitative palettes first and
    among the sequential ones second; if neither has a palette of that name,
    the sequential Viridis palette is used.

    Args:
        columns (pd.Series): The unique column categories (any iterable of values works).
        color_palette (str): The name of the color palette.
        null_label (str): The label to be used for null values.

    Returns:
        dict: Mapping of column values to colors. The null label is always
        present in the mapping and mapped to "gray".
    """
    # Only list/tuple attributes are palettes. The palette modules also carry
    # other attributes (helper functions, dunder attributes), which a plain
    # hasattr() check would accept and which then fail on len()/indexing below.
    color_scale = getattr(px.colors.qualitative, color_palette, None)
    if not isinstance(color_scale, (list, tuple)):
        color_scale = getattr(px.colors.sequential, color_palette, None)
    if not isinstance(color_scale, (list, tuple)) or not color_scale:
        color_scale = px.colors.sequential.Viridis

    # The position counter also advances for the null label, so the colors of
    # the remaining columns depend only on their position in `columns`.
    column_colors = {
        column: color_scale[i % len(color_scale)]
        for i, column in enumerate(columns) if column != null_label
    }
    column_colors[null_label] = "gray"  # Assign gray to null columns

    return column_colors
1156
+
1157
+
1158
def plot_facet_stacked_bars(
    df: pd.DataFrame,
    subplots_per_row: int = 4,
    top_n_index: int = 0,
    top_n_category: int = 0,
    top_n_facet: int = 0,
    null_label: str = "<NA>",
    subplot_size: int = 300,
    color_palette: str = "Plotly",
    caption: str = "",
    renderer: Optional[Literal["png", "svg"]] = "png",
    annotations: bool = False,
    precision: int = 0,
    png_path: Optional[Path] = None,
) -> pd.DataFrame:
    """
    Create a grid of stacked bar charts, one subplot per facet value.

    The columns of `df` are interpreted by position: 1st = index (x axis),
    2nd = category (stacked segments), 3rd = facet (one subplot each) and,
    if present, 4th = value. With only 3 columns every row counts as 1.
    The input DataFrame is not modified.

    Args:
        df (pd.DataFrame): DataFrame with 3 or 4 columns.
        subplots_per_row (int): Number of subplots per row.
        top_n_index (int): Top N index values to keep. 0 means take all.
        top_n_category (int): Top N category values to keep. 0 means take all.
        top_n_facet (int): Top N facet values to keep. 0 means take all.
        null_label (str): Label for null values.
        subplot_size (int): Size of each subplot (used for both height and width).
        color_palette (str): Name of the color palette.
        caption (str): Optional caption to prepend to the title.
        renderer (Optional[Literal["png", "svg"]]): Renderer passed to `fig.show()`.
        annotations (bool): Whether to show annotations in the subplots.
        precision (int): Decimal precision for annotations.
        png_path (Optional[Path]): If given, the figure is also written to this path.

    Returns:
        pd.DataFrame: Aggregated dataset used for plotting.

    Raises:
        ValueError: If `df` does not have 3 or 4 columns.
    """
    # Validate input DataFrame
    if df.shape[1] not in (3, 4):
        raise ValueError("Input DataFrame must have 3 or 4 columns.")

    # Store original column names (used for the title)
    original_column_names = df.columns.tolist()

    # Work on a copy: renaming the columns and adding 'value' must not leak
    # into the caller's DataFrame.
    df = df.copy()

    # Rename columns to more concise names
    if df.shape[1] == 3:
        df.columns = ['index', 'col', 'facet']
        df['value'] = 1  # Treat all rows as having a value of 1
    else:
        df.columns = ['index', 'col', 'facet', 'value']

    # Aggregate and filter data
    aggregated_df = aggregate_data(df, top_n_index, top_n_category, top_n_facet, null_label)

    # Get unique facets and columns
    facets = list(aggregated_df['facet'].unique())
    columns = aggregated_df['col'].unique()

    # Assign colors to columns
    column_colors = assign_column_colors(columns, color_palette, null_label)

    # Grid dimensions, computed once and reused for the layout size below
    n_rows = -(-len(facets) // subplots_per_row)  # Ceiling division
    n_cols = min(subplots_per_row, len(facets))

    # Create subplot grid; titles are handed over as a plain list of strings
    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=[str(facet) for facet in facets],
    )

    # Add traces for each facet
    added_to_legend = set()  # Track which columns have been added to the legend
    for i, facet in enumerate(facets):
        facet_data = aggregated_df[aggregated_df['facet'] == facet]
        row = (i // subplots_per_row) + 1
        col = (i % subplots_per_row) + 1

        for column in columns:
            column_data = facet_data[facet_data['col'] == column]
            # Each category gets exactly one legend entry, from its first trace
            show_legend = column not in added_to_legend
            if show_legend:
                added_to_legend.add(column)

            fig.add_trace(
                go.Bar(
                    x=column_data['index'],
                    y=column_data['value'],
                    name=column,
                    marker=dict(color=column_colors[column]),
                    showlegend=show_legend,
                ),
                row=row,
                col=col,
            )

            # Add annotations if annotations is True
            # NOTE(review): y is the segment's own value, not its stacked
            # height, so labels of upper segments may not sit on their
            # segment - confirm the intended placement.
            if annotations:
                for _, row_data in column_data.iterrows():
                    fig.add_annotation(
                        x=row_data['index'],
                        y=row_data['value'],
                        text=f"{row_data['value']:.{precision}f}",
                        showarrow=False,
                        row=row,
                        col=col,
                    )

    # Create the dynamic title; n is the number of rows in the aggregated data
    unique_rows = len(aggregated_df)
    title = f"{caption} [{original_column_names[0]}] by [{original_column_names[1]}] by [{original_column_names[2]}], n = {unique_rows:_}"

    # Update layout for stacking, title, and theme
    template = "plotly_dark" if os.getenv("THEME") == "dark" else "plotly"
    fig.update_layout(
        title=title,
        barmode="stack",  # Enable stacking
        height=subplot_size * n_rows,
        width=subplot_size * n_cols,
        showlegend=True,
        template=template,
    )

    # Save the figure if png_path is specified
    if png_path:
        png_path = Path(png_path)
        fig.write_image(str(png_path))

    # Show the figure with the renderer specified
    fig.show(renderer)

    # Return the aggregated dataset
    return aggregated_df
1287
+
@@ -699,7 +699,9 @@ def show_num_df(
699
699
 
700
700
  return out
701
701
 
702
- def print_summary(df: pd.DataFrame | pd.Series, name: str="🟠 "):
702
+
703
+
704
+ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="🟠 "):
703
705
  """
704
706
  Print statistical summary for a pandas DataFrame or Series.
705
707
 
@@ -712,11 +714,13 @@ def print_summary(df: pd.DataFrame | pd.Series, name: str="🟠 "):
712
714
  Args:
713
715
  df (Union[pd.DataFrame, pd.Series]): Input DataFrame or Series. Only numeric columns
714
716
  in DataFrame are considered.
717
+ show (bool, optional): Whether to print the summary. Defaults to True.
718
+ name (str, optional): Prefix for the summary. Defaults to "🟠 "
715
719
  """
716
720
  if df.empty:
717
721
  return
718
722
 
719
- def print_summary_ser(ser: pd.Series, name: str=""):
723
+ def print_summary_ser(ser: pd.Series, show: bool=True, name: str=""):
720
724
  # Calculate IQR and pass `rng=(25, 75)` to get the interquartile range
721
725
  iqr_value = stats.iqr(ser)
722
726
 
@@ -744,14 +748,32 @@ def print_summary(df: pd.DataFrame | pd.Series, name: str="🟠 "):
744
748
  upper = max if upper > max else upper
745
749
 
746
750
  # * extra care for scipy metrics, these are very vulnerable to nan
747
- print(
748
- f"""{name} min: {min:_} | lower: {lower:_} | q25: {q1:_} | median: {med:_} | mean: {mean:_} | q75: {q3:_} | upper: {upper:_} | max: {max:_} | std: {std:_} | cv: {cv:_} | sum: {sum:_} | skew: {skew} | kurto: {kurto}""")
751
+ if show:
752
+ print(
753
+ f"""{name} min: {min:_} | lower: {lower:_} | q25: {q1:_} | median: {med:_} | mean: {mean:_} | q75: {q3:_} | upper: {upper:_} | max: {max:_} | std: {std:_} | cv: {cv:_} | sum: {sum:_} | skew: {skew} | kurto: {kurto}""")
754
+
755
+ summary = {
756
+ "min": min,
757
+ "lower": lower,
758
+ "q25": q1,
759
+ "median": med,
760
+ "mean": mean,
761
+ "q75": q3,
762
+ "upper": upper,
763
+ "max": max,
764
+ "std": std,
765
+ "cv": cv,
766
+ "sum": sum,
767
+ "skew": skew,
768
+ "kurto": kurto
769
+ }
770
+ return summary
749
771
 
750
772
  if isinstance(df, pd.Series):
751
- print_summary_ser(df, name)
752
- return
773
+ return print_summary_ser(df, show=show, name=name)
774
+
753
775
  if isinstance(df, pd.DataFrame):
754
776
  # * only show numerics
755
777
  for col in df.select_dtypes("number").columns:
756
- print_summary_ser(ser=df[col], name=col)
757
- return
778
+ summary = print_summary_ser(ser=df[col],show=show, name=col)
779
+ return summary
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: pandas-plots
3
- Version: 0.11.28
3
+ Version: 0.12.1
4
4
  Summary: A collection of helper for table handling and visualization
5
5
  Home-page: https://github.com/smeisegeier/pandas-plots
6
6
  Author: smeisegeier
@@ -85,6 +85,7 @@ tbl.show_num_df(
85
85
  - `descr_db()` a very short descr for a `duckdb` relation
86
86
  - `pivot_df()` gets a pivot table of a 3 column dataframe (or 2 columns if no weights are given)
87
87
  - `print_summary()` shows statistics for a pandas DataFrame or Series
88
+ <br>
88
89
 
89
90
  - `pls` for plotly visualizations
90
91
  - `plot_box()` auto annotated boxplot w/ violin option
@@ -95,10 +96,13 @@ tbl.show_num_df(
95
96
  - `plot_histogram()` histogram for one or more **numerical** columns
96
97
  - `plot_joints()` a joint plot for **exactly two numerical** columns
97
98
  - `plot_quadrants()` quickly shows a 2x2 heatmap
99
+ - 🆕 `plot_facet_stacked_bars()` shows stacked bars for a facet value as subplots
100
+ <br>
98
101
 
99
102
  - `ven` offers functions for _venn diagrams_
100
103
  - `show_venn2()` displays a venn diagram for 2 sets
101
104
  - `show_venn3()` displays a venn diagram for 3 sets
105
+ <br>
102
106
 
103
107
  - `hlp` contains some (variety) helper functions
104
108
  - `to_series()` converts a dataframe to a series (`🚨 breaking change`)
@@ -110,6 +114,7 @@ tbl.show_num_df(
110
114
  - `show_package_version` prints version of a list of packages
111
115
  - `get_os` helps to identify and ensure operating system at runtime
112
116
  - `🆕 add_bitmask_label()` adds a column to the data that resolves a bitmask column into human-readable labels
117
+ <br>
113
118
 
114
119
  - `pii` has routines for handling of personally identifiable information
115
120
  - `remove_pii()` logs and deletes pii from a series
File without changes