pandas-plots 0.12.4__py3-none-any.whl → 0.12.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pandas_plots/pls.py +103 -39
- {pandas_plots-0.12.4.dist-info → pandas_plots-0.12.6.dist-info}/METADATA +3 -3
- pandas_plots-0.12.6.dist-info/RECORD +11 -0
- pandas_plots-0.12.6.dist-info/pii.py +76 -0
- pandas_plots-0.12.4.dist-info/RECORD +0 -10
- {pandas_plots-0.12.4.dist-info → pandas_plots-0.12.6.dist-info}/LICENSE +0 -0
- {pandas_plots-0.12.4.dist-info → pandas_plots-0.12.6.dist-info}/WHEEL +0 -0
- {pandas_plots-0.12.4.dist-info → pandas_plots-0.12.6.dist-info}/top_level.txt +0 -0
pandas_plots/pls.py
CHANGED
@@ -78,7 +78,7 @@ def assign_column_colors(columns, color_palette, null_label):
|
|
78
78
|
raise ValueError(f"Invalid color palette: {color_palette}")
|
79
79
|
|
80
80
|
colors = {col: palette[i % len(palette)] for i, col in enumerate(sorted(columns))}
|
81
|
-
colors[null_label] = "
|
81
|
+
colors[null_label] = "lightgray"
|
82
82
|
return colors
|
83
83
|
|
84
84
|
### main functions
|
@@ -192,12 +192,32 @@ def plot_stacked_bars(
|
|
192
192
|
) -> object:
|
193
193
|
"""
|
194
194
|
Generates a stacked bar plot using the provided DataFrame.
|
195
|
-
Updated to assign colors using `assign_column_colors` with nulls colored grey.
|
196
195
|
|
197
196
|
Parameters:
|
198
|
-
|
199
|
-
-
|
200
|
-
-
|
197
|
+
- df (pd.DataFrame): The input DataFrame with at least two categorical columns and one numerical column.
|
198
|
+
- top_n_index (int): Limit the number of categories displayed on the index axis.
|
199
|
+
- top_n_color (int): Limit the number of categories displayed in the color legend.
|
200
|
+
- dropna (bool): If True, removes rows with missing values; otherwise, replaces them with `null_label`.
|
201
|
+
- swap (bool): If True, swaps the first two columns.
|
202
|
+
- normalize (bool): If True, normalizes numerical values between 0 and 1.
|
203
|
+
- relative (bool): If True, normalizes the bars to a percentage scale.
|
204
|
+
- orientation (Literal["h", "v"]): Defines the orientation of the bars ("v" for vertical, "h" for horizontal).
|
205
|
+
- height (int): Height of the plot.
|
206
|
+
- width (int): Width of the plot.
|
207
|
+
- title (str): Custom title for the plot.
|
208
|
+
- renderer (Literal["png", "svg", None]): Defines the output format.
|
209
|
+
- caption (str): Optional caption for additional context.
|
210
|
+
- sort_values (bool):
|
211
|
+
- If True, sorts bars by the sum of their values (descending).
|
212
|
+
- If False, sorts bars alphabetically.
|
213
|
+
- show_total (bool): If True, adds a row with the total sum of all categories.
|
214
|
+
- precision (int): Number of decimal places for numerical values.
|
215
|
+
- png_path (Path | str): If specified, saves the plot as a PNG file.
|
216
|
+
- color_palette (str): Name of the color palette to use.
|
217
|
+
- null_label (str): Label for null values.
|
218
|
+
|
219
|
+
Returns:
|
220
|
+
- A Plotly figure object representing the stacked bar chart.
|
201
221
|
"""
|
202
222
|
BAR_LENGTH_MULTIPLIER = 1.05
|
203
223
|
|
@@ -211,6 +231,8 @@ def plot_stacked_bars(
|
|
211
231
|
print("❌ first 2 columns must be str")
|
212
232
|
return
|
213
233
|
|
234
|
+
df = df.copy() # Copy the input DataFrame to avoid modifying the original
|
235
|
+
|
214
236
|
# * add count column[2] as a service if none is present
|
215
237
|
if len(df.columns) == 2:
|
216
238
|
df["cnt"] = 1
|
@@ -234,16 +256,6 @@ def plot_stacked_bars(
|
|
234
256
|
col_index = df.columns[0] if not swap else df.columns[1]
|
235
257
|
col_color = df.columns[1] if not swap else df.columns[0]
|
236
258
|
|
237
|
-
# * assign colors to columns
|
238
|
-
unique_colors = sorted(df[col_color].unique())
|
239
|
-
column_colors = assign_column_colors(unique_colors, color_palette, null_label)
|
240
|
-
|
241
|
-
# * add total as aggregation of df
|
242
|
-
if show_total:
|
243
|
-
df_total = df.copy()
|
244
|
-
df_total[col_index] = " TOTAL" # add space to make this item first
|
245
|
-
df = pd.concat([df, df_total])
|
246
|
-
|
247
259
|
# * ensure df is grouped to prevent false aggregations
|
248
260
|
df = (
|
249
261
|
df.groupby([df.columns[0], df.columns[1]])
|
@@ -251,6 +263,39 @@ def plot_stacked_bars(
|
|
251
263
|
.sum()
|
252
264
|
.reset_index()
|
253
265
|
)
|
266
|
+
|
267
|
+
# * Sorting logic based on sort_values
|
268
|
+
if sort_values:
|
269
|
+
sort_order = (
|
270
|
+
df.groupby(col_index)[df.columns[2]].sum().sort_values(ascending=False).index
|
271
|
+
)
|
272
|
+
else:
|
273
|
+
sort_order = sorted(df[col_index].unique()) # Alphabetical order
|
274
|
+
df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
|
275
|
+
|
276
|
+
# * add total as aggregation of df
|
277
|
+
if show_total:
|
278
|
+
df_total = df.copy()
|
279
|
+
df_total[col_index] = " TOTAL" # add space to make this item first
|
280
|
+
df = pd.concat([df, df_total])
|
281
|
+
|
282
|
+
# * Convert to categorical with explicit ordering
|
283
|
+
df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
|
284
|
+
|
285
|
+
if top_n_index > 0 and len(sort_order) > top_n_index:
|
286
|
+
top_categories = sort_order[:top_n_index]
|
287
|
+
df[col_index] = df[col_index].apply(lambda x: x if x in top_categories else "<other>")
|
288
|
+
|
289
|
+
unique_colors = sorted(df[col_color].unique())
|
290
|
+
if top_n_color > 0 and len(unique_colors) > top_n_color:
|
291
|
+
top_colors = unique_colors[:top_n_color]
|
292
|
+
df[col_color] = df[col_color].apply(lambda x: x if x in top_colors else "<other>")
|
293
|
+
|
294
|
+
column_colors = assign_column_colors(sorted(df[col_color].unique()), color_palette, null_label)
|
295
|
+
|
296
|
+
# # * assign colors to columns
|
297
|
+
# unique_colors = sorted(df[col_color].unique())
|
298
|
+
# column_colors = assign_column_colors(unique_colors, color_palette, null_label)
|
254
299
|
|
255
300
|
# * calculate n
|
256
301
|
divider = 2 if show_total else 1
|
@@ -264,7 +309,7 @@ def plot_stacked_bars(
|
|
264
309
|
caption = _set_caption(caption)
|
265
310
|
|
266
311
|
# * plot
|
267
|
-
|
312
|
+
fig = px.bar(
|
268
313
|
df,
|
269
314
|
x=col_index if orientation == "v" else df.columns[2],
|
270
315
|
y=df.columns[2] if orientation == "v" else col_index,
|
@@ -277,6 +322,9 @@ def plot_stacked_bars(
|
|
277
322
|
width=width,
|
278
323
|
height=height,
|
279
324
|
color_discrete_map=column_colors, # Use assigned colors
|
325
|
+
category_orders={col_index: list(df[col_index].cat.categories)}, # <- Add this line
|
326
|
+
# category_orders={col_index: df[col_index].categories.tolist() if isinstance(df[col_index].dtype, pd.CategoricalDtype) else sorted(df[col_index].unique())}
|
327
|
+
|
280
328
|
)
|
281
329
|
# * get longest bar
|
282
330
|
bar_max = (
|
@@ -286,14 +334,14 @@ def plot_stacked_bars(
|
|
286
334
|
# * ignore if bar mode is on
|
287
335
|
if not relative:
|
288
336
|
if orientation == "v":
|
289
|
-
|
337
|
+
fig.update_yaxes(range=[0, bar_max])
|
290
338
|
else:
|
291
|
-
|
339
|
+
fig.update_xaxes(range=[0, bar_max])
|
292
340
|
else:
|
293
|
-
|
341
|
+
fig.update_layout(barnorm="percent")
|
294
342
|
|
295
343
|
# * set title properties
|
296
|
-
|
344
|
+
fig.update_layout(
|
297
345
|
title={
|
298
346
|
# 'x': 0.1,
|
299
347
|
"y": 0.95,
|
@@ -308,27 +356,27 @@ def plot_stacked_bars(
|
|
308
356
|
# * set dtick
|
309
357
|
if orientation == "h":
|
310
358
|
if relative:
|
311
|
-
|
312
|
-
|
313
|
-
|
359
|
+
fig.update_xaxes(dtick=5)
|
360
|
+
# bug dticks are ultra dense
|
361
|
+
# elif normalize:
|
362
|
+
# fig.update_xaxes(dtick=0.05)
|
314
363
|
else:
|
315
364
|
if relative:
|
316
|
-
|
317
|
-
elif normalize:
|
318
|
-
|
365
|
+
fig.update_yaxes(dtick=5)
|
366
|
+
# elif normalize:
|
367
|
+
# fig.update_yaxes(dtick=0.05)
|
319
368
|
|
320
369
|
# * show grids, set to smaller distance on pct scale
|
321
|
-
|
322
|
-
|
370
|
+
fig.update_xaxes(showgrid=True, gridwidth=1)
|
371
|
+
fig.update_yaxes(showgrid=True, gridwidth=1)
|
323
372
|
|
324
373
|
# * save to png if path is provided
|
325
374
|
if png_path is not None:
|
326
|
-
|
327
|
-
|
328
|
-
_fig.show(renderer)
|
375
|
+
fig.write_image(Path(png_path).as_posix())
|
329
376
|
|
330
|
-
|
377
|
+
fig.show(renderer=renderer)
|
331
378
|
|
379
|
+
return fig
|
332
380
|
|
333
381
|
|
334
382
|
def plot_bars(
|
@@ -927,7 +975,8 @@ def plot_box(
|
|
927
975
|
fig.show("png")
|
928
976
|
|
929
977
|
if summary:
|
930
|
-
|
978
|
+
# * if only series is provided, col name is None
|
979
|
+
print_summary(ser.to_frame())
|
931
980
|
|
932
981
|
# * save to png if path is provided
|
933
982
|
if png_path is not None:
|
@@ -1141,8 +1190,23 @@ def plot_facet_stacked_bars(
|
|
1141
1190
|
|
1142
1191
|
aggregated_df = aggregate_data(df, top_n_index, top_n_columns, top_n_facet, null_label)
|
1143
1192
|
|
1144
|
-
facets = aggregated_df['facet'].unique()
|
1145
|
-
|
1193
|
+
# facets = aggregated_df['facet'].unique()
|
1194
|
+
facets = sorted(aggregated_df['facet'].unique()) # Ensure facets are sorted consistently
|
1195
|
+
|
1196
|
+
if top_n_columns > 0:
|
1197
|
+
top_columns = aggregated_df.groupby('col', observed=True)['value'].sum().nlargest(top_n_columns).index.tolist()
|
1198
|
+
# aggregated_df['col'] = aggregated_df['col'].apply(lambda x: x if x in top_columns else "<other>")
|
1199
|
+
# aggregated_df['col'] = pd.Categorical(aggregated_df['col'], categories=top_columns + ["<other>"], ordered=True)
|
1200
|
+
# aggregated_df['col'] = pd.Categorical(
|
1201
|
+
# aggregated_df['col'].map(lambda x: x if x in top_columns else "<other>"),
|
1202
|
+
# categories=top_columns + ["<other>"],
|
1203
|
+
# ordered=True
|
1204
|
+
# )
|
1205
|
+
aggregated_df['col'] = aggregated_df['col'].apply(lambda x: x if x in top_columns else "<other>")
|
1206
|
+
|
1207
|
+
|
1208
|
+
# columns = sorted(aggregated_df['col'].unique())
|
1209
|
+
columns = aggregated_df.groupby('col', observed=True)['value'].sum().sort_values(ascending=False).index.tolist()
|
1146
1210
|
column_colors = assign_column_colors(columns, color_palette, null_label)
|
1147
1211
|
|
1148
1212
|
fig = make_subplots(
|
@@ -1189,17 +1253,17 @@ def plot_facet_stacked_bars(
|
|
1189
1253
|
unique_rows = len(aggregated_df)
|
1190
1254
|
axis_details = []
|
1191
1255
|
if top_n_index > 0:
|
1192
|
-
axis_details.append(f"
|
1256
|
+
axis_details.append(f"TOP {top_n_index} [{original_column_names[0]}]")
|
1193
1257
|
else:
|
1194
1258
|
axis_details.append(f"[{original_column_names[0]}]")
|
1195
1259
|
|
1196
1260
|
if top_n_columns > 0:
|
1197
|
-
axis_details.append(f"
|
1261
|
+
axis_details.append(f"TOP {top_n_columns} [{original_column_names[1]}]")
|
1198
1262
|
else:
|
1199
1263
|
axis_details.append(f"[{original_column_names[1]}]")
|
1200
1264
|
|
1201
1265
|
if top_n_facet > 0:
|
1202
|
-
axis_details.append(f"
|
1266
|
+
axis_details.append(f"TOP {top_n_facet} [{original_column_names[2]}]")
|
1203
1267
|
else:
|
1204
1268
|
axis_details.append(f"[{original_column_names[2]}]")
|
1205
1269
|
|
@@ -1218,6 +1282,6 @@ def plot_facet_stacked_bars(
|
|
1218
1282
|
png_path = Path(png_path)
|
1219
1283
|
fig.write_image(str(png_path))
|
1220
1284
|
|
1221
|
-
fig.show(renderer)
|
1285
|
+
fig.show(renderer=renderer)
|
1222
1286
|
|
1223
1287
|
return fig
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: pandas-plots
|
3
|
-
Version: 0.12.
|
3
|
+
Version: 0.12.6
|
4
4
|
Summary: A collection of helper for table handling and visualization
|
5
5
|
Home-page: https://github.com/smeisegeier/pandas-plots
|
6
6
|
Author: smeisegeier
|
@@ -20,7 +20,7 @@ Requires-Python: >=3.10
|
|
20
20
|
Description-Content-Type: text/markdown
|
21
21
|
License-File: LICENSE
|
22
22
|
Requires-Dist: pandas>=2.0.0
|
23
|
-
Requires-Dist: plotly
|
23
|
+
Requires-Dist: plotly<6
|
24
24
|
Requires-Dist: matplotlib>=3.8.2
|
25
25
|
Requires-Dist: matplotlib-venn==0.11.10
|
26
26
|
Requires-Dist: seaborn>=0.13.2
|
@@ -96,7 +96,7 @@ tbl.show_num_df(
|
|
96
96
|
- `plot_histogram()` histogram for one or more **numerical** columns
|
97
97
|
- `plot_joints()` a joint plot for **exactly two numerical** columns
|
98
98
|
- `plot_quadrants()` quickly shows a 2x2 heatmap
|
99
|
-
- 🆕 `
|
99
|
+
- 🆕 `plot_facet_stacked_bars()` shows stacked bars for a facet value as subplots
|
100
100
|
<br>
|
101
101
|
|
102
102
|
- `ven` offers functions for _venn diagrams_
|
@@ -0,0 +1,11 @@
|
|
1
|
+
pandas_plots/hlp.py,sha256=N6NrbFagVMMX-ZnV0rIBEz82SeSoOkksfMcCap55W7E,16588
|
2
|
+
pandas_plots/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
|
3
|
+
pandas_plots/pls.py,sha256=DsFnWbGNmMnZ8a2qnZFwXH6VekwPFaIwZEQ9TVp6xCg,43997
|
4
|
+
pandas_plots/tbl.py,sha256=4VvjLisPT1gSvgsLClcrhC7LIJ-_FPNla8nomGflGag,30509
|
5
|
+
pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
|
6
|
+
pandas_plots-0.12.6.dist-info/LICENSE,sha256=6KQ5KVAAhRaB-JJKpX4cefKvRZRgI7GUPc92_2d31XY,1051
|
7
|
+
pandas_plots-0.12.6.dist-info/METADATA,sha256=-mCMgoWTwG6HSL8JtuYvwM1LCkzglJm3aIocaUMijO4,7358
|
8
|
+
pandas_plots-0.12.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
9
|
+
pandas_plots-0.12.6.dist-info/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
|
10
|
+
pandas_plots-0.12.6.dist-info/top_level.txt,sha256=XnaNuIHBqMmCeh_U7nKOYTwFue_SIA0wxuDgdPmnnSk,13
|
11
|
+
pandas_plots-0.12.6.dist-info/RECORD,,
|
@@ -0,0 +1,76 @@
|
|
1
|
+
import pandas as pd
|
2
|
+
import re
|
3
|
+
|
4
|
+
|
5
|
+
def remove_pii(
    series: pd.Series,
    verbose: bool = True,
    logging: bool = False,
    custom_regex: str = "",
) -> pd.Index:
    """
    Find entries of a column that look like personally identifiable information (PII).

    The input series is not modified; only the index labels of the matching
    entries are returned so that the caller can drop them.

    Parameters:
    - series: A pandas Series representing a column in a DataFrame.
    - verbose: If True, print the number of pii items and the items themselves
    - logging: If True, write pii items into the file .pii.log (overwritten on each call)
    - custom_regex: Additional regex; entries matching it are reported as well

    Returns:
    - index object with the labels of all pii items

    Raises:
    - AssertionError: if `series` is empty

    Remarks:
    - df.drop(axis=0, index=result, inplace=True)
    """

    # * reject empty columns
    # raised explicitly instead of via `assert` so the check survives `python -O`
    if len(series) == 0:
        raise AssertionError("series must not be empty")

    # * na must be dropped to ensure processing (dropna returns a new object)
    col = series.dropna()

    def _matching_index(pattern: str, case: bool = True) -> pd.Index:
        # na=False: non-string entries of an object column yield NaN in the
        # mask, which cannot be used for boolean indexing
        mask = col.str.contains(pattern, case=case, regex=True, na=False)
        return col[mask].index

    # * terms (matched case-insensitively, anywhere in the text)
    terms = ("lösch", "herr", "frau", "strasse", "klinik")

    # * dates like 31.12.1999
    ptr_date = r"\d{2}\.\d{2}\.\d{4}"

    # * dr / titles
    # the character classes must not contain "|": inside [] it is a literal
    # pipe, not an alternation.
    # NOTE(review): the blanks around the alternatives are part of the pattern
    # and are kept unchanged (e.g. " Fr. " only matches when surrounded by
    # blanks) - confirm that this is intended.
    ptr_dr = r"[Dd][Rr]\. | Fr\. | Hr\. | PD "

    further_patterns = [ptr_date, ptr_dr]
    # * custom
    if custom_regex:
        further_patterns.append(custom_regex)

    if col.empty:
        # only missing values: nothing to search, and the .str accessor is
        # not available on an empty non-string column
        idx_all = col.index[:0]
    else:
        idx_all = _matching_index("|".join(terms), case=False)
        for pattern in further_patterns:
            idx_all = idx_all.union(_matching_index(pattern))

    if verbose:
        print(f"found {len(idx_all):_} pii items:")
        print(col.loc[idx_all].tolist())

    if logging:
        data = col.loc[idx_all]
        # explicit encoding: the items are expected to contain non-ASCII text
        with open(".pii.log", "w", encoding="utf-8") as f:
            # ! when using str(), it will give only a summary!
            f.write(data.to_string(index=True))

    return idx_all
|
@@ -1,10 +0,0 @@
|
|
1
|
-
pandas_plots/hlp.py,sha256=N6NrbFagVMMX-ZnV0rIBEz82SeSoOkksfMcCap55W7E,16588
|
2
|
-
pandas_plots/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
|
3
|
-
pandas_plots/pls.py,sha256=isveg6_frLZC3Gt3VEsdOLiLw7aTf3riUahmJLHiEq8,40265
|
4
|
-
pandas_plots/tbl.py,sha256=4VvjLisPT1gSvgsLClcrhC7LIJ-_FPNla8nomGflGag,30509
|
5
|
-
pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
|
6
|
-
pandas_plots-0.12.4.dist-info/LICENSE,sha256=6KQ5KVAAhRaB-JJKpX4cefKvRZRgI7GUPc92_2d31XY,1051
|
7
|
-
pandas_plots-0.12.4.dist-info/METADATA,sha256=WZUfWOid_eYMtuS2V_P_C_ChaD1dTqDfuectlxzAJe8,7358
|
8
|
-
pandas_plots-0.12.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
9
|
-
pandas_plots-0.12.4.dist-info/top_level.txt,sha256=XnaNuIHBqMmCeh_U7nKOYTwFue_SIA0wxuDgdPmnnSk,13
|
10
|
-
pandas_plots-0.12.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|