pandas-plots 0.12.1__tar.gz → 0.12.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pandas_plots-0.12.1/src/pandas_plots.egg-info → pandas_plots-0.12.3}/PKG-INFO +1 -1
- {pandas_plots-0.12.1 → pandas_plots-0.12.3}/setup.cfg +1 -1
- {pandas_plots-0.12.1 → pandas_plots-0.12.3}/src/pandas_plots/pls.py +122 -224
- {pandas_plots-0.12.1 → pandas_plots-0.12.3}/src/pandas_plots/tbl.py +8 -7
- {pandas_plots-0.12.1 → pandas_plots-0.12.3/src/pandas_plots.egg-info}/PKG-INFO +1 -1
- {pandas_plots-0.12.1 → pandas_plots-0.12.3}/LICENSE +0 -0
- {pandas_plots-0.12.1 → pandas_plots-0.12.3}/README.md +0 -0
- {pandas_plots-0.12.1 → pandas_plots-0.12.3}/pyproject.toml +0 -0
- {pandas_plots-0.12.1 → pandas_plots-0.12.3}/src/pandas_plots/hlp.py +0 -0
- {pandas_plots-0.12.1 → pandas_plots-0.12.3}/src/pandas_plots/pii.py +0 -0
- {pandas_plots-0.12.1 → pandas_plots-0.12.3}/src/pandas_plots/ven.py +0 -0
- {pandas_plots-0.12.1 → pandas_plots-0.12.3}/src/pandas_plots.egg-info/SOURCES.txt +0 -0
- {pandas_plots-0.12.1 → pandas_plots-0.12.3}/src/pandas_plots.egg-info/dependency_links.txt +0 -0
- {pandas_plots-0.12.1 → pandas_plots-0.12.3}/src/pandas_plots.egg-info/requires.txt +0 -0
- {pandas_plots-0.12.1 → pandas_plots-0.12.3}/src/pandas_plots.egg-info/top_level.txt +0 -0
@@ -16,11 +16,73 @@ from plotly.subplots import make_subplots
|
|
16
16
|
from .hlp import *
|
17
17
|
from .tbl import print_summary
|
18
18
|
|
19
|
+
### helper functions
|
19
20
|
|
20
21
|
def _set_caption(caption: str) -> str:
|
21
22
|
return f"#️⃣{'-'.join(caption.split())}, " if caption else ""
|
22
23
|
|
23
24
|
|
25
|
+
def aggregate_data(df: pd.DataFrame, top_n_index: int, top_n_columns: int, top_n_facet: int, null_label: str) -> pd.DataFrame:
|
26
|
+
"""
|
27
|
+
Aggregates the data, ensuring each combination of 'index', 'col', and 'facet' is unique with summed 'value'.
|
28
|
+
|
29
|
+
Args:
|
30
|
+
df (pd.DataFrame): Input DataFrame.
|
31
|
+
top_n_index (int): top N values of the first column to keep. 0 means take all.
|
32
|
+
top_n_columns (int): top N values of the second column to keep. 0 means take all.
|
33
|
+
top_n_facet (int): top N values of the third column to keep. 0 means take all.
|
34
|
+
null_label (str): Label for null values.
|
35
|
+
|
36
|
+
Returns:
|
37
|
+
pd.DataFrame: Aggregated and filtered dataset.
|
38
|
+
"""
|
39
|
+
for col in ['index', 'col', 'facet']: # Skip 'value' column (numeric)
|
40
|
+
df[col] = df[col].fillna(null_label)
|
41
|
+
|
42
|
+
# Aggregate data to ensure unique combinations
|
43
|
+
aggregated_df = df.groupby(['index', 'col', 'facet'], as_index=False)['value'].sum()
|
44
|
+
|
45
|
+
# Reduce data based on top_n parameters
|
46
|
+
if top_n_index > 0:
|
47
|
+
top_indexes = aggregated_df.groupby('index')['value'].sum().nlargest(top_n_index).index
|
48
|
+
aggregated_df = aggregated_df[aggregated_df['index'].isin(top_indexes)]
|
49
|
+
if top_n_columns > 0:
|
50
|
+
top_columns = aggregated_df.groupby('col')['value'].sum().nlargest(top_n_columns).index
|
51
|
+
aggregated_df = aggregated_df[aggregated_df['col'].isin(top_columns)]
|
52
|
+
if top_n_facet > 0:
|
53
|
+
top_facets = aggregated_df.groupby('facet')['value'].sum().nlargest(top_n_facet).index
|
54
|
+
aggregated_df = aggregated_df[aggregated_df['facet'].isin(top_facets)]
|
55
|
+
|
56
|
+
# Ensure facets are sorted alphabetically
|
57
|
+
aggregated_df['facet'] = pd.Categorical(aggregated_df['facet'], sorted(aggregated_df['facet'].unique()))
|
58
|
+
aggregated_df = aggregated_df.sort_values(by='facet')
|
59
|
+
|
60
|
+
return aggregated_df
|
61
|
+
|
62
|
+
|
63
|
+
def assign_column_colors(columns, color_palette, null_label):
|
64
|
+
"""
|
65
|
+
Assigns colors to columns, with a special gray color for null values.
|
66
|
+
|
67
|
+
Args:
|
68
|
+
columns (list): List of column values.
|
69
|
+
color_palette (str): Name of the color palette.
|
70
|
+
null_label (str): Label for null values.
|
71
|
+
|
72
|
+
Returns:
|
73
|
+
dict: Mapping of column values to colors.
|
74
|
+
"""
|
75
|
+
if hasattr(px.colors.qualitative, color_palette):
|
76
|
+
palette = getattr(px.colors.qualitative, color_palette)
|
77
|
+
else:
|
78
|
+
raise ValueError(f"Invalid color palette: {color_palette}")
|
79
|
+
|
80
|
+
colors = {col: palette[i % len(palette)] for i, col in enumerate(sorted(columns))}
|
81
|
+
colors[null_label] = "gray"
|
82
|
+
return colors
|
83
|
+
|
84
|
+
### main functions
|
85
|
+
|
24
86
|
def plot_quadrants(
|
25
87
|
df: pd.DataFrame,
|
26
88
|
title: str = None,
|
@@ -125,35 +187,17 @@ def plot_stacked_bars(
|
|
125
187
|
show_total: bool = False,
|
126
188
|
precision: int = 0,
|
127
189
|
png_path: Path | str = None,
|
190
|
+
color_palette: str = "Plotly",
|
191
|
+
null_label: str = "<NA>",
|
128
192
|
) -> object:
|
129
193
|
"""
|
130
194
|
Generates a stacked bar plot using the provided DataFrame.
|
131
|
-
|
132
|
-
- index axis
|
133
|
-
- color axis (legend)
|
134
|
-
- values (optional, if absent a simple count is applied)
|
195
|
+
Updated to assign colors using `assign_column_colors` with nulls colored grey.
|
135
196
|
|
136
197
|
Parameters:
|
137
|
-
|
138
|
-
-
|
139
|
-
-
|
140
|
-
- dropna: bool = False - Whether to include NULL values in the plot.
|
141
|
-
- swap: bool = False - Whether to swap the x-axis and y-axis.
|
142
|
-
- normalize: bool = False - Whether to normalize the values.
|
143
|
-
- relative: bool = False - Whether to show relative values as bars instead of absolute.
|
144
|
-
- orientation: Literal["h", "v"] = "v" - The orientation of the plot.
|
145
|
-
- height: int = 500 - The height of the plot.
|
146
|
-
- width: An optional integer indicating the width of the chart. Default is 2000.
|
147
|
-
- title: str = None - The title of the plot.
|
148
|
-
- renderer: Literal["png", "svg",None] = "png" - The renderer for the plot.
|
149
|
-
- caption: An optional string indicating the caption for the chart.
|
150
|
-
- sort_values: bool = False - Sort axis by index (default) or values
|
151
|
-
- show_total: bool = False - Whether to show the total value
|
152
|
-
- precision: int = 0 - The number of decimal places to round to
|
153
|
-
- png_path (Path | str, optional): The path to save the image as a png file. Defaults to None.
|
154
|
-
|
155
|
-
Returns:
|
156
|
-
plot object
|
198
|
+
All parameters are similar to the original function, with the addition of:
|
199
|
+
- color_palette: str - Name of the color palette.
|
200
|
+
- null_label: str - Label for null values.
|
157
201
|
"""
|
158
202
|
BAR_LENGTH_MULTIPLIER = 1.05
|
159
203
|
|
@@ -171,9 +215,9 @@ def plot_stacked_bars(
|
|
171
215
|
if len(df.columns) == 2:
|
172
216
|
df["cnt"] = 1
|
173
217
|
|
174
|
-
# *
|
218
|
+
# * handle null values
|
175
219
|
if not dropna:
|
176
|
-
df = df.fillna(
|
220
|
+
df = df.fillna(null_label)
|
177
221
|
else:
|
178
222
|
df.dropna(inplace=True)
|
179
223
|
|
@@ -190,6 +234,10 @@ def plot_stacked_bars(
|
|
190
234
|
col_index = df.columns[0] if not swap else df.columns[1]
|
191
235
|
col_color = df.columns[1] if not swap else df.columns[0]
|
192
236
|
|
237
|
+
# * assign colors to columns
|
238
|
+
unique_colors = sorted(df[col_color].unique())
|
239
|
+
column_colors = assign_column_colors(unique_colors, color_palette, null_label)
|
240
|
+
|
193
241
|
# * add total as aggregation of df
|
194
242
|
if show_total:
|
195
243
|
df_total = df.copy()
|
@@ -198,12 +246,8 @@ def plot_stacked_bars(
|
|
198
246
|
|
199
247
|
# * ensure df is grouped to prevent false aggregations
|
200
248
|
df = (
|
201
|
-
df.groupby(
|
202
|
-
|
203
|
-
df.columns[0],
|
204
|
-
df.columns[1],
|
205
|
-
]
|
206
|
-
)[df.columns[2]]
|
249
|
+
df.groupby([df.columns[0], df.columns[1]])
|
250
|
+
[df.columns[2]]
|
207
251
|
.sum()
|
208
252
|
.reset_index()
|
209
253
|
)
|
@@ -212,135 +256,43 @@ def plot_stacked_bars(
|
|
212
256
|
divider = 2 if show_total else 1
|
213
257
|
n = int(df[df.columns[2]].sum() / divider)
|
214
258
|
|
215
|
-
# *
|
216
|
-
df["cnt_pct"] = df[df.columns[2]] / n # * col[3]
|
217
|
-
df["cnt_str"] = df[df.columns[2]].apply(lambda x: f"{x:_}") # * col[4]
|
218
|
-
df["cnt_pct_str"] = df["cnt_pct"].apply(lambda x: f"{x:.2%}") # * col[5]
|
219
|
-
|
220
|
-
# * now set calculated col
|
221
|
-
col_value = df.columns[2] if not normalize else df.columns[3]
|
222
|
-
col_value_str = df.columns[4] if not normalize else df.columns[5]
|
223
|
-
|
224
|
-
if top_n_index > 0:
|
225
|
-
# * get top n -> series
|
226
|
-
# * on pivot tables (all cells are values) you can also use sum for each column[df.sum(axis=1) > n]
|
227
|
-
ser_top_n = (
|
228
|
-
df.groupby(col_index)[col_value]
|
229
|
-
.sum()
|
230
|
-
.sort_values(ascending=False)[:top_n_index]
|
231
|
-
)
|
232
|
-
# * only process top n indexes. this does not change pct values
|
233
|
-
df = df[df[col_index].isin(ser_top_n.index)]
|
234
|
-
|
235
|
-
if top_n_color > 0:
|
236
|
-
# * get top n -> series
|
237
|
-
# * on pivot tables (all cells are values) you can also use sum for each column[df.sum(axis=1) > n]
|
238
|
-
ser_top_n_col = (
|
239
|
-
df.groupby(col_color)[col_value]
|
240
|
-
.sum()
|
241
|
-
.sort_values(ascending=False)[:top_n_color]
|
242
|
-
)
|
243
|
-
# * only process top n colors. this does not change pct values
|
244
|
-
df = df[df[col_color].isin(ser_top_n_col.index)]
|
245
|
-
|
246
|
-
# * get longest bar
|
247
|
-
bar_max = (
|
248
|
-
df.groupby(col_index)[col_value].sum().sort_values(ascending=False).iloc[0]
|
249
|
-
* BAR_LENGTH_MULTIPLIER
|
250
|
-
)
|
251
|
-
|
252
|
-
# * are TOP n selected? include in default title
|
259
|
+
# * title str
|
253
260
|
_title_str_top_index = f"TOP{top_n_index} " if top_n_index > 0 else ""
|
254
261
|
_title_str_top_color = f"TOP{top_n_color} " if top_n_color > 0 else ""
|
255
|
-
|
256
|
-
# * title str na
|
257
262
|
_title_str_null = f", NULL excluded" if dropna else ""
|
258
|
-
|
259
|
-
# * title str n
|
260
263
|
_title_str_n = f", n={n:_}"
|
261
|
-
|
262
264
|
caption = _set_caption(caption)
|
263
265
|
|
264
266
|
# * plot
|
265
267
|
_fig = px.bar(
|
266
268
|
df,
|
267
|
-
x=col_index if orientation == "v" else
|
268
|
-
y=
|
269
|
+
x=col_index if orientation == "v" else df.columns[2],
|
270
|
+
y=df.columns[2] if orientation == "v" else col_index,
|
269
271
|
color=col_color,
|
270
|
-
text=
|
271
|
-
# barmode="stack",
|
272
|
+
text=df.columns[2],
|
272
273
|
orientation=orientation,
|
273
274
|
title=title
|
274
275
|
or f"{caption}{_title_str_top_index}[{col_index}] by {_title_str_top_color}[{col_color}]{_title_str_null}{_title_str_n}",
|
275
|
-
# * retrieve theme from env (intro.set_theme) or default
|
276
276
|
template="plotly_dark" if os.getenv("THEME") == "dark" else "plotly",
|
277
277
|
width=width,
|
278
278
|
height=height,
|
279
|
-
|
280
|
-
|
281
|
-
# * ignore if bar mode is on
|
282
|
-
if not relative:
|
283
|
-
if orientation == "v":
|
284
|
-
_fig.update_yaxes(range=[0, bar_max])
|
285
|
-
else:
|
286
|
-
_fig.update_xaxes(range=[0, bar_max])
|
287
|
-
else:
|
288
|
-
_fig.update_layout(barnorm="percent")
|
289
|
-
|
290
|
-
# * set title properties
|
291
|
-
_fig.update_layout(
|
292
|
-
title={
|
293
|
-
# 'x': 0.1,
|
294
|
-
"y": 0.95,
|
295
|
-
"xanchor": "left",
|
296
|
-
"yanchor": "top",
|
297
|
-
"font": {
|
298
|
-
"size": 24,
|
299
|
-
},
|
300
|
-
},
|
279
|
+
color_discrete_map=column_colors, # Use assigned colors
|
301
280
|
)
|
302
281
|
|
303
282
|
# * show grids, set to smaller distance on pct scale
|
304
|
-
_fig.update_xaxes(
|
305
|
-
|
306
|
-
gridwidth=1,
|
307
|
-
)
|
308
|
-
_fig.update_yaxes(
|
309
|
-
showgrid=True,
|
310
|
-
gridwidth=1,
|
311
|
-
)
|
312
|
-
|
313
|
-
# * set dtick
|
314
|
-
if orientation == "h":
|
315
|
-
if relative:
|
316
|
-
_fig.update_xaxes(dtick=5)
|
317
|
-
elif normalize:
|
318
|
-
_fig.update_xaxes(dtick=0.05)
|
319
|
-
else:
|
320
|
-
if relative:
|
321
|
-
_fig.update_yaxes(dtick=5)
|
322
|
-
elif normalize:
|
323
|
-
_fig.update_yaxes(dtick=0.05)
|
324
|
-
|
325
|
-
# * sorting is in a weird spot, do a 1:1 matrix
|
326
|
-
if orientation == "v" and sort_values:
|
327
|
-
_fig.update_layout(xaxis={"categoryorder": "total descending"})
|
328
|
-
elif orientation == "v" and not sort_values:
|
329
|
-
_fig.update_layout(xaxis={"categoryorder": "category ascending"})
|
330
|
-
elif orientation == "h" and sort_values:
|
331
|
-
_fig.update_layout(yaxis={"categoryorder": "total ascending"})
|
332
|
-
elif orientation == "h" and not sort_values:
|
333
|
-
_fig.update_layout(yaxis={"categoryorder": "category descending"})
|
334
|
-
|
335
|
-
_fig.show(renderer)
|
283
|
+
_fig.update_xaxes(showgrid=True, gridwidth=1)
|
284
|
+
_fig.update_yaxes(showgrid=True, gridwidth=1)
|
336
285
|
|
337
286
|
# * save to png if path is provided
|
338
287
|
if png_path is not None:
|
339
288
|
_fig.write_image(Path(png_path).as_posix())
|
340
289
|
|
290
|
+
_fig.show(renderer)
|
291
|
+
|
341
292
|
return _fig
|
342
293
|
|
343
294
|
|
295
|
+
|
344
296
|
def plot_bars(
|
345
297
|
df_in: pd.Series | pd.DataFrame,
|
346
298
|
caption: str = None,
|
@@ -1094,72 +1046,12 @@ def plot_boxes(
|
|
1094
1046
|
return fig
|
1095
1047
|
|
1096
1048
|
|
1097
|
-
def aggregate_data(df: pd.DataFrame, top_n_index: int, top_n_category: int, top_n_facet: int, null_label: str) -> pd.DataFrame:
|
1098
|
-
"""
|
1099
|
-
Aggregates the data, ensuring each combination of 'index', 'col', and 'facet' is unique with summed 'value'.
|
1100
|
-
|
1101
|
-
Args:
|
1102
|
-
df (pd.DataFrame): Input DataFrame.
|
1103
|
-
top_n_index (int): Top N values of the first column to keep. 0 means take all.
|
1104
|
-
top_n_category (int): Top N values of the second column to keep. 0 means take all.
|
1105
|
-
top_n_facet (int): Top N values of the third column to keep. 0 means take all.
|
1106
|
-
null_label (str): Label for null values.
|
1107
|
-
|
1108
|
-
Returns:
|
1109
|
-
pd.DataFrame: Aggregated and filtered dataset.
|
1110
|
-
"""
|
1111
|
-
# Replace nulls with a placeholder for consistent handling
|
1112
|
-
for col in ['index', 'col', 'facet']: # Skip 'value' column (numeric)
|
1113
|
-
df[col] = df[col].fillna(null_label)
|
1114
|
-
|
1115
|
-
# Aggregate data to ensure unique combinations
|
1116
|
-
aggregated_df = df.groupby(['index', 'col', 'facet'], as_index=False)['value'].sum()
|
1117
|
-
|
1118
|
-
# Reduce data based on top_n parameters
|
1119
|
-
if top_n_index > 0:
|
1120
|
-
top_indexes = aggregated_df.groupby('index')['value'].sum().nlargest(top_n_index).index
|
1121
|
-
aggregated_df = aggregated_df[aggregated_df['index'].isin(top_indexes)]
|
1122
|
-
if top_n_category > 0:
|
1123
|
-
top_categories = aggregated_df.groupby('col')['value'].sum().nlargest(top_n_category).index
|
1124
|
-
aggregated_df = aggregated_df[aggregated_df['col'].isin(top_categories)]
|
1125
|
-
if top_n_facet > 0:
|
1126
|
-
top_facets = aggregated_df.groupby('facet')['value'].sum().nlargest(top_n_facet).index
|
1127
|
-
aggregated_df = aggregated_df[aggregated_df['facet'].isin(top_facets)]
|
1128
|
-
|
1129
|
-
return aggregated_df
|
1130
|
-
|
1131
|
-
|
1132
|
-
def assign_column_colors(columns: pd.Series, color_palette: str, null_label: str) -> dict:
|
1133
|
-
"""
|
1134
|
-
Assign colors to columns using the selected color palette and handle null columns separately.
|
1135
|
-
|
1136
|
-
Args:
|
1137
|
-
columns (pd.Series): The unique column categories.
|
1138
|
-
color_palette (str): The name of the color palette.
|
1139
|
-
null_label (str): The label to be used for null values.
|
1140
|
-
|
1141
|
-
Returns:
|
1142
|
-
dict: Mapping of column values to colors.
|
1143
|
-
"""
|
1144
|
-
if hasattr(px.colors.qualitative, color_palette):
|
1145
|
-
color_scale = px.colors.qualitative.__dict__.get(color_palette, px.colors.qualitative.Plotly)
|
1146
|
-
else:
|
1147
|
-
color_scale = px.colors.sequential.__dict__.get(color_palette, px.colors.sequential.Viridis)
|
1148
|
-
|
1149
|
-
column_colors = {
|
1150
|
-
column: color_scale[i % len(color_scale)]
|
1151
|
-
for i, column in enumerate(columns) if column != null_label
|
1152
|
-
}
|
1153
|
-
column_colors[null_label] = "gray" # Assign gray to null columns
|
1154
|
-
|
1155
|
-
return column_colors
|
1156
|
-
|
1157
1049
|
|
1158
1050
|
def plot_facet_stacked_bars(
|
1159
1051
|
df: pd.DataFrame,
|
1160
1052
|
subplots_per_row: int = 4,
|
1161
1053
|
top_n_index: int = 0,
|
1162
|
-
|
1054
|
+
top_n_columns: int = 0,
|
1163
1055
|
top_n_facet: int = 0,
|
1164
1056
|
null_label: str = "<NA>",
|
1165
1057
|
subplot_size: int = 300,
|
@@ -1169,16 +1061,16 @@ def plot_facet_stacked_bars(
|
|
1169
1061
|
annotations: bool = False,
|
1170
1062
|
precision: int = 0,
|
1171
1063
|
png_path: Optional[Path] = None,
|
1172
|
-
) ->
|
1064
|
+
) -> object:
|
1173
1065
|
"""
|
1174
1066
|
Create a grid of stacked bar charts.
|
1175
1067
|
|
1176
1068
|
Args:
|
1177
1069
|
df (pd.DataFrame): DataFrame with 3 or 4 columns.
|
1178
1070
|
subplots_per_row (int): Number of subplots per row.
|
1179
|
-
top_n_index (int):
|
1180
|
-
|
1181
|
-
top_n_facet (int):
|
1071
|
+
top_n_index (int): top N index values to keep.
|
1072
|
+
top_n_columns (int): top N column values to keep.
|
1073
|
+
top_n_facet (int): top N facet values to keep.
|
1182
1074
|
null_label (str): Label for null values.
|
1183
1075
|
subplot_size (int): Size of each subplot.
|
1184
1076
|
color_palette (str): Name of the color palette.
|
@@ -1189,41 +1081,39 @@ def plot_facet_stacked_bars(
|
|
1189
1081
|
png_path (Optional[Path]): Path to save the image.
|
1190
1082
|
|
1191
1083
|
Returns:
|
1192
|
-
|
1084
|
+
plot object
|
1085
|
+
|
1086
|
+
Remarks:
|
1087
|
+
If you need to include facets that have no data, fill up like this beforehand:
|
1088
|
+
df.loc[len(df)]=[None, None, 12]
|
1193
1089
|
"""
|
1194
|
-
|
1090
|
+
|
1091
|
+
df = df.copy() # Copy the input DataFrame to avoid modifying the original
|
1092
|
+
|
1195
1093
|
if not (df.shape[1] == 3 or df.shape[1] == 4):
|
1196
1094
|
raise ValueError("Input DataFrame must have 3 or 4 columns.")
|
1197
1095
|
|
1198
|
-
# Store original column names
|
1199
1096
|
original_column_names = df.columns.tolist()
|
1200
1097
|
|
1201
|
-
# Rename columns to more concise names
|
1202
1098
|
if df.shape[1] == 3:
|
1203
1099
|
df.columns = ['index', 'col', 'facet']
|
1204
|
-
df['value'] = 1
|
1100
|
+
df['value'] = 1
|
1205
1101
|
elif df.shape[1] == 4:
|
1206
1102
|
df.columns = ['index', 'col', 'facet', 'value']
|
1207
1103
|
|
1208
|
-
|
1209
|
-
aggregated_df = aggregate_data(df, top_n_index, top_n_category, top_n_facet, null_label)
|
1104
|
+
aggregated_df = aggregate_data(df, top_n_index, top_n_columns, top_n_facet, null_label)
|
1210
1105
|
|
1211
|
-
# Get unique facets and columns
|
1212
1106
|
facets = aggregated_df['facet'].unique()
|
1213
|
-
columns = aggregated_df['col'].unique()
|
1214
|
-
|
1215
|
-
# Assign colors to columns
|
1107
|
+
columns = sorted(aggregated_df['col'].unique())
|
1216
1108
|
column_colors = assign_column_colors(columns, color_palette, null_label)
|
1217
1109
|
|
1218
|
-
# Create subplot grid
|
1219
1110
|
fig = make_subplots(
|
1220
|
-
rows=-(-len(facets) // subplots_per_row),
|
1111
|
+
rows=-(-len(facets) // subplots_per_row),
|
1221
1112
|
cols=min(subplots_per_row, len(facets)),
|
1222
1113
|
subplot_titles=facets,
|
1223
1114
|
)
|
1224
1115
|
|
1225
|
-
|
1226
|
-
added_to_legend = set() # Track which columns have been added to the legend
|
1116
|
+
added_to_legend = set()
|
1227
1117
|
for i, facet in enumerate(facets):
|
1228
1118
|
facet_data = aggregated_df[aggregated_df['facet'] == facet]
|
1229
1119
|
row = (i // subplots_per_row) + 1
|
@@ -1247,7 +1137,6 @@ def plot_facet_stacked_bars(
|
|
1247
1137
|
col=col,
|
1248
1138
|
)
|
1249
1139
|
|
1250
|
-
# Add annotations if annotations is True
|
1251
1140
|
if annotations:
|
1252
1141
|
for _, row_data in column_data.iterrows():
|
1253
1142
|
fig.add_annotation(
|
@@ -1259,29 +1148,38 @@ def plot_facet_stacked_bars(
|
|
1259
1148
|
col=col,
|
1260
1149
|
)
|
1261
1150
|
|
1262
|
-
# Create the dynamic title
|
1263
1151
|
unique_rows = len(aggregated_df)
|
1264
|
-
|
1152
|
+
axis_details = []
|
1153
|
+
if top_n_index > 0:
|
1154
|
+
axis_details.append(f"top {top_n_index} [{original_column_names[0]}]")
|
1155
|
+
else:
|
1156
|
+
axis_details.append(f"[{original_column_names[0]}]")
|
1157
|
+
|
1158
|
+
if top_n_columns > 0:
|
1159
|
+
axis_details.append(f"top {top_n_columns} [{original_column_names[1]}]")
|
1160
|
+
else:
|
1161
|
+
axis_details.append(f"[{original_column_names[1]}]")
|
1162
|
+
|
1163
|
+
if top_n_facet > 0:
|
1164
|
+
axis_details.append(f"top {top_n_facet} [{original_column_names[2]}]")
|
1165
|
+
else:
|
1166
|
+
axis_details.append(f"[{original_column_names[2]}]")
|
1265
1167
|
|
1266
|
-
|
1168
|
+
title = f"{caption} {', '.join(axis_details)}, n = {unique_rows:_}"
|
1267
1169
|
template = "plotly_dark" if os.getenv("THEME") == "dark" else "plotly"
|
1268
1170
|
fig.update_layout(
|
1269
1171
|
title=title,
|
1270
|
-
barmode="stack",
|
1172
|
+
barmode="stack",
|
1271
1173
|
height=subplot_size * (-(-len(facets) // subplots_per_row)),
|
1272
1174
|
width=subplot_size * min(subplots_per_row, len(facets)),
|
1273
1175
|
showlegend=True,
|
1274
1176
|
template=template,
|
1275
1177
|
)
|
1276
1178
|
|
1277
|
-
# Save the figure if png_path is specified
|
1278
1179
|
if png_path:
|
1279
1180
|
png_path = Path(png_path)
|
1280
1181
|
fig.write_image(str(png_path))
|
1281
1182
|
|
1282
|
-
# Show the figure with the renderer specified
|
1283
1183
|
fig.show(renderer)
|
1284
1184
|
|
1285
|
-
|
1286
|
-
return aggregated_df
|
1287
|
-
|
1185
|
+
return fig
|
@@ -162,12 +162,7 @@ def describe_df(
|
|
162
162
|
# * only show numerics
|
163
163
|
for col in df.select_dtypes("number").columns:
|
164
164
|
_u, _h = get_uniques_header(col)
|
165
|
-
|
166
|
-
# * extra care for scipy metrics, these are very vulnarable to nan
|
167
|
-
# print(
|
168
|
-
# f"{_h} min: {round(df[col].min(),3):_} | max: {round(df[col].max(),3):_} | median: {round(df[col].median(),3):_} | mean: {round(df[col].mean(),3):_} | std: {round(df[col].std(),3):_} | cv: {round(df[col].std() / df[col].mean(),3):_} | sum: {round(df[col].sum(),3):_} | skew: {round(stats.skew(df[col].dropna().tolist()),3)} | kurto: {round(stats.kurtosis(df[col].dropna().tolist()),3)}"
|
169
|
-
# )
|
170
|
-
print_summary(df[col], _h)
|
165
|
+
print_summary(df=df[col], name=_h)
|
171
166
|
|
172
167
|
# * show first 3 rows
|
173
168
|
display(df[:3])
|
@@ -710,6 +705,8 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
|
|
710
705
|
75th percentile (Q3), upper bound, maximum, standard deviation, coefficient of variation,
|
711
706
|
sum, skewness, and kurtosis. The interquartile range (IQR) is used to compute the lower
|
712
707
|
and upper bounds, which are adjusted not to exceed the min and max of the data.
|
708
|
+
|
709
|
+
df is being dropna() beforehand to ensure scipy results
|
713
710
|
|
714
711
|
Args:
|
715
712
|
df (Union[pd.DataFrame, pd.Series]): Input DataFrame or Series. Only numeric columns
|
@@ -720,12 +717,15 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
|
|
720
717
|
if df.empty:
|
721
718
|
return
|
722
719
|
|
720
|
+
# * drop NA to keep scipy sane
|
721
|
+
df = df.dropna().copy()
|
722
|
+
|
723
723
|
def print_summary_ser(ser: pd.Series, show: bool=True, name: str=""):
|
724
724
|
# Calculate IQR and pass `rng=(25, 75)` to get the interquartile range
|
725
725
|
iqr_value = stats.iqr(ser)
|
726
726
|
|
727
727
|
# * drop NA to keep scipy sane
|
728
|
-
ser.dropna(inplace=True)
|
728
|
+
# ser.dropna(inplace=True)
|
729
729
|
|
730
730
|
# Using the iqr function, we still calculate the bounds manually
|
731
731
|
q1 = stats.scoreatpercentile(ser, 25)
|
@@ -776,4 +776,5 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
|
|
776
776
|
# * only show numerics
|
777
777
|
for col in df.select_dtypes("number").columns:
|
778
778
|
summary = print_summary_ser(ser=df[col],show=show, name=col)
|
779
|
+
|
779
780
|
return summary
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|