pandas-plots 0.11.28__tar.gz → 0.12.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pandas_plots-0.11.28/src/pandas_plots.egg-info → pandas_plots-0.12.1}/PKG-INFO +7 -2
- {pandas_plots-0.11.28 → pandas_plots-0.12.1}/README.md +5 -0
- {pandas_plots-0.11.28 → pandas_plots-0.12.1}/setup.cfg +1 -1
- {pandas_plots-0.11.28 → pandas_plots-0.12.1}/src/pandas_plots/pls.py +194 -50
- {pandas_plots-0.11.28 → pandas_plots-0.12.1}/src/pandas_plots/tbl.py +30 -8
- {pandas_plots-0.11.28 → pandas_plots-0.12.1/src/pandas_plots.egg-info}/PKG-INFO +7 -2
- {pandas_plots-0.11.28 → pandas_plots-0.12.1}/LICENSE +0 -0
- {pandas_plots-0.11.28 → pandas_plots-0.12.1}/pyproject.toml +0 -0
- {pandas_plots-0.11.28 → pandas_plots-0.12.1}/src/pandas_plots/hlp.py +0 -0
- {pandas_plots-0.11.28 → pandas_plots-0.12.1}/src/pandas_plots/pii.py +0 -0
- {pandas_plots-0.11.28 → pandas_plots-0.12.1}/src/pandas_plots/ven.py +0 -0
- {pandas_plots-0.11.28 → pandas_plots-0.12.1}/src/pandas_plots.egg-info/SOURCES.txt +0 -0
- {pandas_plots-0.11.28 → pandas_plots-0.12.1}/src/pandas_plots.egg-info/dependency_links.txt +0 -0
- {pandas_plots-0.11.28 → pandas_plots-0.12.1}/src/pandas_plots.egg-info/requires.txt +0 -0
- {pandas_plots-0.11.28 → pandas_plots-0.12.1}/src/pandas_plots.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.2
|
2
2
|
Name: pandas-plots
|
3
|
-
Version: 0.11.28
|
3
|
+
Version: 0.12.1
|
4
4
|
Summary: A collection of helper for table handling and visualization
|
5
5
|
Home-page: https://github.com/smeisegeier/pandas-plots
|
6
6
|
Author: smeisegeier
|
@@ -85,6 +85,7 @@ tbl.show_num_df(
|
|
85
85
|
- `descr_db()` a very short descr for a `duckdb` relation
|
86
86
|
- `pivot_df()` gets a pivot table of a 3 column dataframe (or 2 columns if no weights are given)
|
87
87
|
- `print_summary()` shows statistics for a pandas DataFrame or Series
|
88
|
+
<br>
|
88
89
|
|
89
90
|
- `pls` for plotly visualizations
|
90
91
|
- `plot_box()` auto annotated boxplot w/ violin option
|
@@ -95,10 +96,13 @@ tbl.show_num_df(
|
|
95
96
|
- `plot_histogram()` histogram for one or more **numerical** columns
|
96
97
|
- `plot_joints()` a joint plot for **exactly two numerical** columns
|
97
98
|
- `plot_quadrants()` quickly shows a 2x2 heatmap
|
99
|
+
- 🆕 `plot_facet_stacked_bars()` shows stacked bars for a facet value as subplots
|
100
|
+
<br>
|
98
101
|
|
99
102
|
- `ven` offers functions for _venn diagrams_
|
100
103
|
- `show_venn2()` displays a venn diagram for 2 sets
|
101
104
|
- `show_venn3()` displays a venn diagram for 3 sets
|
105
|
+
<br>
|
102
106
|
|
103
107
|
- `hlp` contains some (variety) helper functions
|
104
108
|
- `to_series()` converts a dataframe to a series (`🚨 breaking change`)
|
@@ -110,6 +114,7 @@ tbl.show_num_df(
|
|
110
114
|
- `show_package_version` prints version of a list of packages
|
111
115
|
- `get_os` helps to identify and ensure operating system at runtime
|
112
116
|
- `🆕 add_bitmask_label()` adds a column to the data that resolves a bitmask column into human-readable labels
|
117
|
+
<br>
|
113
118
|
|
114
119
|
- `pii` has routines for handling of personally identifiable information
|
115
120
|
- `remove_pii()` logs and deletes pii from a series
|
@@ -50,6 +50,7 @@ tbl.show_num_df(
|
|
50
50
|
- `descr_db()` a very short descr for a `duckdb` relation
|
51
51
|
- `pivot_df()` gets a pivot table of a 3 column dataframe (or 2 columns if no weights are given)
|
52
52
|
- `print_summary()` shows statistics for a pandas DataFrame or Series
|
53
|
+
<br>
|
53
54
|
|
54
55
|
- `pls` for plotly visualizations
|
55
56
|
- `plot_box()` auto annotated boxplot w/ violin option
|
@@ -60,10 +61,13 @@ tbl.show_num_df(
|
|
60
61
|
- `plot_histogram()` histogram for one or more **numerical** columns
|
61
62
|
- `plot_joints()` a joint plot for **exactly two numerical** columns
|
62
63
|
- `plot_quadrants()` quickly shows a 2x2 heatmap
|
64
|
+
- 🆕 `plot_facet_stacked_bars()` shows stacked bars for a facet value as subplots
|
65
|
+
<br>
|
63
66
|
|
64
67
|
- `ven` offers functions for _venn diagrams_
|
65
68
|
- `show_venn2()` displays a venn diagram for 2 sets
|
66
69
|
- `show_venn3()` displays a venn diagram for 3 sets
|
70
|
+
<br>
|
67
71
|
|
68
72
|
- `hlp` contains some (variety) helper functions
|
69
73
|
- `to_series()` converts a dataframe to a series (`🚨 breaking change`)
|
@@ -75,6 +79,7 @@ tbl.show_num_df(
|
|
75
79
|
- `show_package_version` prints version of a list of packages
|
76
80
|
- `get_os` helps to identify and ensure operating system at runtime
|
77
81
|
- `🆕 add_bitmask_label()` adds a column to the data that resolves a bitmask column into human-readable labels
|
82
|
+
<br>
|
78
83
|
|
79
84
|
- `pii` has routines for handling of personally identifiable information
|
80
85
|
- `remove_pii()` logs and deletes pii from a series
|
@@ -4,12 +4,14 @@ import warnings
|
|
4
4
|
warnings.filterwarnings("ignore")
|
5
5
|
|
6
6
|
import os
|
7
|
-
from typing import Literal
|
7
|
+
from typing import Optional, Literal
|
8
8
|
|
9
9
|
import pandas as pd
|
10
10
|
import seaborn as sb
|
11
11
|
from matplotlib import pyplot as plt
|
12
12
|
from plotly import express as px
|
13
|
+
import plotly.graph_objects as go
|
14
|
+
from plotly.subplots import make_subplots
|
13
15
|
|
14
16
|
from .hlp import *
|
15
17
|
from .tbl import print_summary
|
@@ -1092,52 +1094,194 @@ def plot_boxes(
|
|
1092
1094
|
return fig
|
1093
1095
|
|
1094
1096
|
|
1095
|
-
|
1096
|
-
|
1097
|
-
|
1098
|
-
|
1099
|
-
|
1100
|
-
|
1101
|
-
|
1102
|
-
|
1103
|
-
|
1104
|
-
|
1105
|
-
|
1106
|
-
|
1107
|
-
|
1108
|
-
|
1109
|
-
#
|
1110
|
-
|
1111
|
-
|
1112
|
-
|
1113
|
-
#
|
1114
|
-
|
1115
|
-
|
1116
|
-
#
|
1117
|
-
|
1118
|
-
|
1119
|
-
|
1120
|
-
|
1121
|
-
|
1122
|
-
|
1123
|
-
|
1124
|
-
|
1125
|
-
|
1126
|
-
|
1127
|
-
|
1128
|
-
|
1129
|
-
|
1130
|
-
|
1131
|
-
|
1132
|
-
|
1133
|
-
|
1134
|
-
|
1135
|
-
|
1136
|
-
|
1137
|
-
|
1138
|
-
|
1139
|
-
|
1140
|
-
|
1141
|
-
|
1142
|
-
|
1143
|
-
|
1097
|
+
def aggregate_data(df: pd.DataFrame, top_n_index: int, top_n_category: int, top_n_facet: int, null_label: str) -> pd.DataFrame:
    """
    Aggregate the data so each ('index', 'col', 'facet') combination is unique with summed 'value'.

    The input frame is not modified: null replacement happens on a new frame.

    Args:
        df (pd.DataFrame): Input DataFrame with the columns 'index', 'col', 'facet' and 'value'.
        top_n_index (int): Top N values of 'index' to keep, ranked by summed 'value'. 0 means take all.
        top_n_category (int): Top N values of 'col' to keep, ranked by summed 'value'. 0 means take all.
        top_n_facet (int): Top N values of 'facet' to keep, ranked by summed 'value'. 0 means take all.
        null_label (str): Label that replaces null values in the three key columns.

    Returns:
        pd.DataFrame: Aggregated and filtered dataset.
    """
    key_columns = ['index', 'col', 'facet']

    # Replace nulls with a placeholder so they survive the groupby.
    # assign() returns a new frame, which keeps the caller's DataFrame untouched.
    # The numeric 'value' column is deliberately left alone.
    filled_df = df.assign(**{key: df[key].fillna(null_label) for key in key_columns})

    # Aggregate data to ensure unique combinations
    aggregated_df = filled_df.groupby(key_columns, as_index=False)['value'].sum()

    # Reduce data based on the top_n parameters. The filters are applied in
    # sequence (index, then col, then facet), so each ranking is computed on
    # the rows that survived the previous filter.
    limits = (top_n_index, top_n_category, top_n_facet)
    for key, top_n in zip(key_columns, limits):
        if top_n > 0:
            keep = aggregated_df.groupby(key)['value'].sum().nlargest(top_n).index
            aggregated_df = aggregated_df[aggregated_df[key].isin(keep)]

    return aggregated_df
|
1130
|
+
|
1131
|
+
|
1132
|
+
def assign_column_colors(columns: pd.Series, color_palette: str, null_label: str) -> dict:
    """
    Assign colors to columns using the selected color palette and handle null columns separately.

    The palette name is looked up first among plotly's qualitative palettes,
    then among the sequential ones. If neither lookup yields a usable palette,
    the sequential Viridis palette is used.

    Args:
        columns (pd.Series): The unique column categories.
        color_palette (str): The name of the color palette.
        null_label (str): The label to be used for null values.

    Returns:
        dict: Mapping of column values to colors. The entry for `null_label`
        is always present and always "gray".
    """
    color_scale = None
    for palette_module in (px.colors.qualitative, px.colors.sequential):
        candidate = getattr(palette_module, color_palette, None)
        # Only accept real palettes: the modules also expose non-list
        # attributes, which would fail later on len() / indexing.
        if isinstance(candidate, (list, tuple)) and candidate:
            color_scale = candidate
            break
    if color_scale is None:
        color_scale = px.colors.sequential.Viridis

    # Colors cycle through the palette. The position from enumerate() counts
    # the null label too, so the null entry consumes one palette slot.
    column_colors = {
        column: color_scale[i % len(color_scale)]
        for i, column in enumerate(columns)
        if column != null_label
    }
    column_colors[null_label] = "gray"  # Assign gray to null columns

    return column_colors
|
1156
|
+
|
1157
|
+
|
1158
|
+
def plot_facet_stacked_bars(
    df: pd.DataFrame,
    subplots_per_row: int = 4,
    top_n_index: int = 0,
    top_n_category: int = 0,
    top_n_facet: int = 0,
    null_label: str = "<NA>",
    subplot_size: int = 300,
    color_palette: str = "Plotly",
    caption: str = "",
    renderer: Optional[Literal["png", "svg"]] = "png",
    annotations: bool = False,
    precision: int = 0,
    png_path: Optional[Path] = None,
) -> pd.DataFrame:
    """
    Create a grid of stacked bar charts, one subplot per facet value.

    The columns are interpreted by position: 1st = bar index (x axis),
    2nd = stacked category (color), 3rd = facet (subplot), optional 4th =
    numeric value. With only 3 columns every row counts as 1.
    The caller's DataFrame is left unchanged.

    Args:
        df (pd.DataFrame): DataFrame with 3 or 4 columns.
        subplots_per_row (int): Number of subplots per row.
        top_n_index (int): Top N index values to keep. 0 means take all.
        top_n_category (int): Top N category values to keep. 0 means take all.
        top_n_facet (int): Top N facet values to keep. 0 means take all.
        null_label (str): Label for null values.
        subplot_size (int): Size of each subplot (used for both width and height).
        color_palette (str): Name of the color palette.
        caption (str): Optional caption to prepend to the title.
        renderer (Optional[Literal["png", "svg"]]): Renderer passed to `fig.show()`.
        annotations (bool): Whether to show annotations in the subplots.
        precision (int): Decimal precision for annotations.
        png_path (Optional[Path]): Path to save the image.

    Returns:
        pd.DataFrame: Aggregated dataset used for plotting.

    Raises:
        ValueError: If the DataFrame does not have 3 or 4 columns.
    """
    # Validate input DataFrame
    if df.shape[1] not in (3, 4):
        raise ValueError("Input DataFrame must have 3 or 4 columns.")

    # Store original column names for the title
    original_column_names = df.columns.tolist()

    # Work on a copy: the renaming and the synthetic 'value' column below
    # must not leak into the caller's DataFrame.
    df = df.copy()

    # Rename columns to more concise names
    if df.shape[1] == 3:
        df.columns = ['index', 'col', 'facet']
        df['value'] = 1  # Treat all rows as having a value of 1
    else:
        df.columns = ['index', 'col', 'facet', 'value']

    # Aggregate and filter data
    aggregated_df = aggregate_data(df, top_n_index, top_n_category, top_n_facet, null_label)

    # Get unique facets and columns
    facets = aggregated_df['facet'].unique()
    columns = aggregated_df['col'].unique()

    # Assign colors to columns
    column_colors = assign_column_colors(columns, color_palette, null_label)

    # Grid shape, computed once and reused for the layout size below
    n_rows = -(-len(facets) // subplots_per_row)  # Ceiling division
    n_cols = min(subplots_per_row, len(facets))

    # Create subplot grid; titles are handed over as a plain list of strings
    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=[str(facet) for facet in facets],
    )

    # Add traces for each facet
    added_to_legend = set()  # Track which columns have been added to the legend
    for i, facet in enumerate(facets):
        facet_data = aggregated_df[aggregated_df['facet'] == facet]
        row = (i // subplots_per_row) + 1
        col = (i % subplots_per_row) + 1

        for column in columns:
            column_data = facet_data[facet_data['col'] == column]
            # Each category appears in the legend once, not once per subplot
            show_legend = column not in added_to_legend
            if show_legend:
                added_to_legend.add(column)

            fig.add_trace(
                go.Bar(
                    x=column_data['index'],
                    y=column_data['value'],
                    name=column,
                    marker=dict(color=column_colors[column]),
                    showlegend=show_legend,
                ),
                row=row,
                col=col,
            )

            # Add annotations if annotations is True
            # NOTE(review): y is the segment's own value, not the cumulative
            # height of the stack - confirm this placement is intended.
            if annotations:
                for _, row_data in column_data.iterrows():
                    fig.add_annotation(
                        x=row_data['index'],
                        y=row_data['value'],
                        text=f"{row_data['value']:.{precision}f}",
                        showarrow=False,
                        row=row,
                        col=col,
                    )

    # Create the dynamic title
    # NOTE(review): n is the number of aggregated rows, not the sum of 'value'
    unique_rows = len(aggregated_df)
    title = f"{caption} [{original_column_names[0]}] by [{original_column_names[1]}] by [{original_column_names[2]}], n = {unique_rows:_}"

    # Update layout for stacking, title, and theme
    template = "plotly_dark" if os.getenv("THEME") == "dark" else "plotly"
    fig.update_layout(
        title=title,
        barmode="stack",  # Enable stacking
        height=subplot_size * n_rows,
        width=subplot_size * n_cols,
        showlegend=True,
        template=template,
    )

    # Save the figure if png_path is specified
    if png_path:
        png_path = Path(png_path)
        fig.write_image(str(png_path))

    # Show the figure with the renderer specified
    fig.show(renderer)

    # Return the aggregated dataset
    return aggregated_df
|
1287
|
+
|
@@ -699,7 +699,9 @@ def show_num_df(
|
|
699
699
|
|
700
700
|
return out
|
701
701
|
|
702
|
-
|
702
|
+
|
703
|
+
|
704
|
+
def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="🟠 "):
|
703
705
|
"""
|
704
706
|
Print statistical summary for a pandas DataFrame or Series.
|
705
707
|
|
@@ -712,11 +714,13 @@ def print_summary(df: pd.DataFrame | pd.Series, name: str="🟠 "):
|
|
712
714
|
Args:
|
713
715
|
df (Union[pd.DataFrame, pd.Series]): Input DataFrame or Series. Only numeric columns
|
714
716
|
in DataFrame are considered.
|
717
|
+
show (bool, optional): Whether to print the summary. Defaults to True.
|
718
|
+
name (str, optional): Prefix for the summary. Defaults to "🟠 "
|
715
719
|
"""
|
716
720
|
if df.empty:
|
717
721
|
return
|
718
722
|
|
719
|
-
def print_summary_ser(ser: pd.Series, name: str=""):
|
723
|
+
def print_summary_ser(ser: pd.Series, show: bool=True, name: str=""):
|
720
724
|
# Calculate IQR and pass `rng=(25, 75)` to get the interquartile range
|
721
725
|
iqr_value = stats.iqr(ser)
|
722
726
|
|
@@ -744,14 +748,32 @@ def print_summary(df: pd.DataFrame | pd.Series, name: str="🟠 "):
|
|
744
748
|
upper = max if upper > max else upper
|
745
749
|
|
746
750
|
# * extra care for scipy metrics, these are very vulnarable to nan
|
747
|
-
|
748
|
-
|
751
|
+
if show:
|
752
|
+
print(
|
753
|
+
f"""{name} min: {min:_} | lower: {lower:_} | q25: {q1:_} | median: {med:_} | mean: {mean:_} | q75: {q3:_} | upper: {upper:_} | max: {max:_} | std: {std:_} | cv: {cv:_} | sum: {sum:_} | skew: {skew} | kurto: {kurto}""")
|
754
|
+
|
755
|
+
summary = {
|
756
|
+
"min": min,
|
757
|
+
"lower": lower,
|
758
|
+
"q25": q1,
|
759
|
+
"median": med,
|
760
|
+
"mean": mean,
|
761
|
+
"q75": q3,
|
762
|
+
"upper": upper,
|
763
|
+
"max": max,
|
764
|
+
"std": std,
|
765
|
+
"cv": cv,
|
766
|
+
"sum": sum,
|
767
|
+
"skew": skew,
|
768
|
+
"kurto": kurto
|
769
|
+
}
|
770
|
+
return summary
|
749
771
|
|
750
772
|
if isinstance(df, pd.Series):
|
751
|
-
print_summary_ser(df, name)
|
752
|
-
|
773
|
+
return print_summary_ser(df, show=show, name=name)
|
774
|
+
|
753
775
|
if isinstance(df, pd.DataFrame):
|
754
776
|
# * only show numerics
|
755
777
|
for col in df.select_dtypes("number").columns:
|
756
|
-
print_summary_ser(ser=df[col], name=col)
|
757
|
-
return
|
778
|
+
summary = print_summary_ser(ser=df[col],show=show, name=col)
|
779
|
+
return summary
|
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.2
|
2
2
|
Name: pandas-plots
|
3
|
-
Version: 0.11.28
|
3
|
+
Version: 0.12.1
|
4
4
|
Summary: A collection of helper for table handling and visualization
|
5
5
|
Home-page: https://github.com/smeisegeier/pandas-plots
|
6
6
|
Author: smeisegeier
|
@@ -85,6 +85,7 @@ tbl.show_num_df(
|
|
85
85
|
- `descr_db()` a very short descr for a `duckdb` relation
|
86
86
|
- `pivot_df()` gets a pivot table of a 3 column dataframe (or 2 columns if no weights are given)
|
87
87
|
- `print_summary()` shows statistics for a pandas DataFrame or Series
|
88
|
+
<br>
|
88
89
|
|
89
90
|
- `pls` for plotly visualizations
|
90
91
|
- `plot_box()` auto annotated boxplot w/ violin option
|
@@ -95,10 +96,13 @@ tbl.show_num_df(
|
|
95
96
|
- `plot_histogram()` histogram for one or more **numerical** columns
|
96
97
|
- `plot_joints()` a joint plot for **exactly two numerical** columns
|
97
98
|
- `plot_quadrants()` quickly shows a 2x2 heatmap
|
99
|
+
- 🆕 `plot_facet_stacked_bars()` shows stacked bars for a facet value as subplots
|
100
|
+
<br>
|
98
101
|
|
99
102
|
- `ven` offers functions for _venn diagrams_
|
100
103
|
- `show_venn2()` displays a venn diagram for 2 sets
|
101
104
|
- `show_venn3()` displays a venn diagram for 3 sets
|
105
|
+
<br>
|
102
106
|
|
103
107
|
- `hlp` contains some (variety) helper functions
|
104
108
|
- `to_series()` converts a dataframe to a series (`🚨 breaking change`)
|
@@ -110,6 +114,7 @@ tbl.show_num_df(
|
|
110
114
|
- `show_package_version` prints version of a list of packages
|
111
115
|
- `get_os` helps to identify and ensure operating system at runtime
|
112
116
|
- `🆕 add_bitmask_label()` adds a column to the data that resolves a bitmask column into human-readable labels
|
117
|
+
<br>
|
113
118
|
|
114
119
|
- `pii` has routines for handling of personally identifiable information
|
115
120
|
- `remove_pii()` logs and deletes pii from a series
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|