pandas-plots 0.12.4__py3-none-any.whl → 0.12.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pandas_plots/pls.py CHANGED
@@ -78,7 +78,7 @@ def assign_column_colors(columns, color_palette, null_label):
78
78
  raise ValueError(f"Invalid color palette: {color_palette}")
79
79
 
80
80
  colors = {col: palette[i % len(palette)] for i, col in enumerate(sorted(columns))}
81
- colors[null_label] = "gray"
81
+ colors[null_label] = "lightgray"
82
82
  return colors
83
83
 
84
84
  ### main functions
@@ -192,12 +192,32 @@ def plot_stacked_bars(
192
192
  ) -> object:
193
193
  """
194
194
  Generates a stacked bar plot using the provided DataFrame.
195
- Updated to assign colors using `assign_column_colors` with nulls colored grey.
196
195
 
197
196
  Parameters:
198
- All parameters are similar to the original function, with the addition of:
199
- - color_palette: str - Name of the color palette.
200
- - null_label: str - Label for null values.
197
+ - df (pd.DataFrame): The input DataFrame with at least two categorical columns and one numerical column.
198
+ - top_n_index (int): Limit the number of categories displayed on the index axis.
199
+ - top_n_color (int): Limit the number of categories displayed in the color legend.
200
+ - dropna (bool): If True, removes rows with missing values; otherwise, replaces them with `null_label`.
201
+ - swap (bool): If True, swaps the first two columns.
202
+ - normalize (bool): If True, normalizes numerical values between 0 and 1.
203
+ - relative (bool): If True, normalizes the bars to a percentage scale.
204
+ - orientation (Literal["h", "v"]): Defines the orientation of the bars ("v" for vertical, "h" for horizontal).
205
+ - height (int): Height of the plot.
206
+ - width (int): Width of the plot.
207
+ - title (str): Custom title for the plot.
208
+ - renderer (Literal["png", "svg", None]): Defines the output format.
209
+ - caption (str): Optional caption for additional context.
210
+ - sort_values (bool):
211
+ - If True, sorts bars by the sum of their values (descending).
212
+ - If False, sorts bars alphabetically.
213
+ - show_total (bool): If True, adds a row with the total sum of all categories.
214
+ - precision (int): Number of decimal places for numerical values.
215
+ - png_path (Path | str): If specified, saves the plot as a PNG file.
216
+ - color_palette (str): Name of the color palette to use.
217
+ - null_label (str): Label for null values.
218
+
219
+ Returns:
220
+ - A Plotly figure object representing the stacked bar chart.
201
221
  """
202
222
  BAR_LENGTH_MULTIPLIER = 1.05
203
223
 
@@ -211,6 +231,8 @@ def plot_stacked_bars(
211
231
  print("❌ first 2 columns must be str")
212
232
  return
213
233
 
234
+ df = df.copy() # Copy the input DataFrame to avoid modifying the original
235
+
214
236
  # * add count column[2] as a service if none is present
215
237
  if len(df.columns) == 2:
216
238
  df["cnt"] = 1
@@ -234,16 +256,6 @@ def plot_stacked_bars(
234
256
  col_index = df.columns[0] if not swap else df.columns[1]
235
257
  col_color = df.columns[1] if not swap else df.columns[0]
236
258
 
237
- # * assign colors to columns
238
- unique_colors = sorted(df[col_color].unique())
239
- column_colors = assign_column_colors(unique_colors, color_palette, null_label)
240
-
241
- # * add total as aggregation of df
242
- if show_total:
243
- df_total = df.copy()
244
- df_total[col_index] = " TOTAL" # add space to make this item first
245
- df = pd.concat([df, df_total])
246
-
247
259
  # * ensure df is grouped to prevent false aggregations
248
260
  df = (
249
261
  df.groupby([df.columns[0], df.columns[1]])
@@ -251,6 +263,39 @@ def plot_stacked_bars(
251
263
  .sum()
252
264
  .reset_index()
253
265
  )
266
+
267
+ # * Sorting logic based on sort_values
268
+ if sort_values:
269
+ sort_order = (
270
+ df.groupby(col_index)[df.columns[2]].sum().sort_values(ascending=False).index
271
+ )
272
+ else:
273
+ sort_order = sorted(df[col_index].unique()) # Alphabetical order
274
+ df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
275
+
276
+ # * add total as aggregation of df
277
+ if show_total:
278
+ df_total = df.copy()
279
+ df_total[col_index] = " TOTAL" # add space to make this item first
280
+ df = pd.concat([df, df_total])
281
+
282
+ # * Convert to categorical with explicit ordering
283
+ df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
284
+
285
+ if top_n_index > 0 and len(sort_order) > top_n_index:
286
+ top_categories = sort_order[:top_n_index]
287
+ df[col_index] = df[col_index].apply(lambda x: x if x in top_categories else "<other>")
288
+
289
+ unique_colors = sorted(df[col_color].unique())
290
+ if top_n_color > 0 and len(unique_colors) > top_n_color:
291
+ top_colors = unique_colors[:top_n_color]
292
+ df[col_color] = df[col_color].apply(lambda x: x if x in top_colors else "<other>")
293
+
294
+ column_colors = assign_column_colors(sorted(df[col_color].unique()), color_palette, null_label)
295
+
296
+ # # * assign colors to columns
297
+ # unique_colors = sorted(df[col_color].unique())
298
+ # column_colors = assign_column_colors(unique_colors, color_palette, null_label)
254
299
 
255
300
  # * calculate n
256
301
  divider = 2 if show_total else 1
@@ -264,7 +309,7 @@ def plot_stacked_bars(
264
309
  caption = _set_caption(caption)
265
310
 
266
311
  # * plot
267
- _fig = px.bar(
312
+ fig = px.bar(
268
313
  df,
269
314
  x=col_index if orientation == "v" else df.columns[2],
270
315
  y=df.columns[2] if orientation == "v" else col_index,
@@ -277,6 +322,9 @@ def plot_stacked_bars(
277
322
  width=width,
278
323
  height=height,
279
324
  color_discrete_map=column_colors, # Use assigned colors
325
+ category_orders={col_index: list(df[col_index].cat.categories)}, # <- Add this line
326
+ # category_orders={col_index: df[col_index].categories.tolist() if isinstance(df[col_index].dtype, pd.CategoricalDtype) else sorted(df[col_index].unique())}
327
+
280
328
  )
281
329
  # * get longest bar
282
330
  bar_max = (
@@ -286,14 +334,14 @@ def plot_stacked_bars(
286
334
  # * ignore if bar mode is on
287
335
  if not relative:
288
336
  if orientation == "v":
289
- _fig.update_yaxes(range=[0, bar_max])
337
+ fig.update_yaxes(range=[0, bar_max])
290
338
  else:
291
- _fig.update_xaxes(range=[0, bar_max])
339
+ fig.update_xaxes(range=[0, bar_max])
292
340
  else:
293
- _fig.update_layout(barnorm="percent")
341
+ fig.update_layout(barnorm="percent")
294
342
 
295
343
  # * set title properties
296
- _fig.update_layout(
344
+ fig.update_layout(
297
345
  title={
298
346
  # 'x': 0.1,
299
347
  "y": 0.95,
@@ -308,27 +356,27 @@ def plot_stacked_bars(
308
356
  # * set dtick
309
357
  if orientation == "h":
310
358
  if relative:
311
- _fig.update_xaxes(dtick=5)
312
- elif normalize:
313
- _fig.update_xaxes(dtick=0.05)
359
+ fig.update_xaxes(dtick=5)
360
+ # bug dticks are ultra dense
361
+ # elif normalize:
362
+ # fig.update_xaxes(dtick=0.05)
314
363
  else:
315
364
  if relative:
316
- _fig.update_yaxes(dtick=5)
317
- elif normalize:
318
- _fig.update_yaxes(dtick=0.05)
365
+ fig.update_yaxes(dtick=5)
366
+ # elif normalize:
367
+ # fig.update_yaxes(dtick=0.05)
319
368
 
320
369
  # * show grids, set to smaller distance on pct scale
321
- _fig.update_xaxes(showgrid=True, gridwidth=1)
322
- _fig.update_yaxes(showgrid=True, gridwidth=1)
370
+ fig.update_xaxes(showgrid=True, gridwidth=1)
371
+ fig.update_yaxes(showgrid=True, gridwidth=1)
323
372
 
324
373
  # * save to png if path is provided
325
374
  if png_path is not None:
326
- _fig.write_image(Path(png_path).as_posix())
327
-
328
- _fig.show(renderer)
375
+ fig.write_image(Path(png_path).as_posix())
329
376
 
330
- return _fig
377
+ fig.show(renderer=renderer)
331
378
 
379
+ return fig
332
380
 
333
381
 
334
382
  def plot_bars(
@@ -927,7 +975,8 @@ def plot_box(
927
975
  fig.show("png")
928
976
 
929
977
  if summary:
930
- print_summary(ser)
978
+ # * if only series is provided, col name is None
979
+ print_summary(ser.to_frame())
931
980
 
932
981
  # * save to png if path is provided
933
982
  if png_path is not None:
@@ -1141,8 +1190,23 @@ def plot_facet_stacked_bars(
1141
1190
 
1142
1191
  aggregated_df = aggregate_data(df, top_n_index, top_n_columns, top_n_facet, null_label)
1143
1192
 
1144
- facets = aggregated_df['facet'].unique()
1145
- columns = sorted(aggregated_df['col'].unique())
1193
+ # facets = aggregated_df['facet'].unique()
1194
+ facets = sorted(aggregated_df['facet'].unique()) # Ensure facets are sorted consistently
1195
+
1196
+ if top_n_columns > 0:
1197
+ top_columns = aggregated_df.groupby('col', observed=True)['value'].sum().nlargest(top_n_columns).index.tolist()
1198
+ # aggregated_df['col'] = aggregated_df['col'].apply(lambda x: x if x in top_columns else "<other>")
1199
+ # aggregated_df['col'] = pd.Categorical(aggregated_df['col'], categories=top_columns + ["<other>"], ordered=True)
1200
+ # aggregated_df['col'] = pd.Categorical(
1201
+ # aggregated_df['col'].map(lambda x: x if x in top_columns else "<other>"),
1202
+ # categories=top_columns + ["<other>"],
1203
+ # ordered=True
1204
+ # )
1205
+ aggregated_df['col'] = aggregated_df['col'].apply(lambda x: x if x in top_columns else "<other>")
1206
+
1207
+
1208
+ # columns = sorted(aggregated_df['col'].unique())
1209
+ columns = aggregated_df.groupby('col', observed=True)['value'].sum().sort_values(ascending=False).index.tolist()
1146
1210
  column_colors = assign_column_colors(columns, color_palette, null_label)
1147
1211
 
1148
1212
  fig = make_subplots(
@@ -1189,17 +1253,17 @@ def plot_facet_stacked_bars(
1189
1253
  unique_rows = len(aggregated_df)
1190
1254
  axis_details = []
1191
1255
  if top_n_index > 0:
1192
- axis_details.append(f"top {top_n_index} [{original_column_names[0]}]")
1256
+ axis_details.append(f"TOP {top_n_index} [{original_column_names[0]}]")
1193
1257
  else:
1194
1258
  axis_details.append(f"[{original_column_names[0]}]")
1195
1259
 
1196
1260
  if top_n_columns > 0:
1197
- axis_details.append(f"top {top_n_columns} [{original_column_names[1]}]")
1261
+ axis_details.append(f"TOP {top_n_columns} [{original_column_names[1]}]")
1198
1262
  else:
1199
1263
  axis_details.append(f"[{original_column_names[1]}]")
1200
1264
 
1201
1265
  if top_n_facet > 0:
1202
- axis_details.append(f"top {top_n_facet} [{original_column_names[2]}]")
1266
+ axis_details.append(f"TOP {top_n_facet} [{original_column_names[2]}]")
1203
1267
  else:
1204
1268
  axis_details.append(f"[{original_column_names[2]}]")
1205
1269
 
@@ -1218,6 +1282,6 @@ def plot_facet_stacked_bars(
1218
1282
  png_path = Path(png_path)
1219
1283
  fig.write_image(str(png_path))
1220
1284
 
1221
- fig.show(renderer)
1285
+ fig.show(renderer=renderer)
1222
1286
 
1223
1287
  return fig
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: pandas-plots
3
- Version: 0.12.4
3
+ Version: 0.12.6
4
4
  Summary: A collection of helper for table handling and visualization
5
5
  Home-page: https://github.com/smeisegeier/pandas-plots
6
6
  Author: smeisegeier
@@ -20,7 +20,7 @@ Requires-Python: >=3.10
20
20
  Description-Content-Type: text/markdown
21
21
  License-File: LICENSE
22
22
  Requires-Dist: pandas>=2.0.0
23
- Requires-Dist: plotly>=5.18.0
23
+ Requires-Dist: plotly<6
24
24
  Requires-Dist: matplotlib>=3.8.2
25
25
  Requires-Dist: matplotlib-venn==0.11.10
26
26
  Requires-Dist: seaborn>=0.13.2
@@ -96,7 +96,7 @@ tbl.show_num_df(
96
96
  - `plot_histogram()` histogram for one or more **numerical** columns
97
97
  - `plot_joints()` a joint plot for **exactly two numerical** columns
98
98
  - `plot_quadrants()` quickly shows a 2x2 heatmap
99
- - 🆕 `plot_stacked_bars()` shows stacked bars for a facet value as subplots
99
+ - 🆕 `plot_facet_stacked_bars()` shows stacked bars for a facet value as subplots
100
100
  <br>
101
101
 
102
102
  - `ven` offers functions for _venn diagrams_
@@ -0,0 +1,11 @@
1
+ pandas_plots/hlp.py,sha256=N6NrbFagVMMX-ZnV0rIBEz82SeSoOkksfMcCap55W7E,16588
2
+ pandas_plots/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
3
+ pandas_plots/pls.py,sha256=DsFnWbGNmMnZ8a2qnZFwXH6VekwPFaIwZEQ9TVp6xCg,43997
4
+ pandas_plots/tbl.py,sha256=4VvjLisPT1gSvgsLClcrhC7LIJ-_FPNla8nomGflGag,30509
5
+ pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
6
+ pandas_plots-0.12.6.dist-info/LICENSE,sha256=6KQ5KVAAhRaB-JJKpX4cefKvRZRgI7GUPc92_2d31XY,1051
7
+ pandas_plots-0.12.6.dist-info/METADATA,sha256=-mCMgoWTwG6HSL8JtuYvwM1LCkzglJm3aIocaUMijO4,7358
8
+ pandas_plots-0.12.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
9
+ pandas_plots-0.12.6.dist-info/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
10
+ pandas_plots-0.12.6.dist-info/top_level.txt,sha256=XnaNuIHBqMmCeh_U7nKOYTwFue_SIA0wxuDgdPmnnSk,13
11
+ pandas_plots-0.12.6.dist-info/RECORD,,
@@ -0,0 +1,76 @@
1
+ import pandas as pd
2
+ import re
3
+
4
+
5
+ def remove_pii(
6
+ series: pd.Series,
7
+ verbose: bool = True,
8
+ logging: bool = False,
9
+ custom_regex="",
10
+ ) -> pd.Index:
11
+ """
12
+ Remove personally identifiable information (PII) from the given column.
13
+
14
+ Parameters:
15
+ - series: A pandas Series representing a column in a DataFrame.
16
+ - verbose: If True, print pii items
17
+ - logging: If True, write pii items into the file .pii.log
18
+ - custom_regex: Regex that is injected into detection
19
+
20
+ Returns:
21
+ - index object with indexes of all pii items
22
+
23
+ Remarks:
24
+ - df.drop(axis=0, index=result, inplace=True)
25
+ """
26
+
27
+ # * reject empty columns
28
+ assert len(series) > 0
29
+
30
+ col = series.copy()
31
+
32
+ # * na must be dropped to ensure processsing
33
+ col.dropna(inplace=True)
34
+
35
+ # * find terms
36
+ _terms = frozenset(["lösch", "herr", "frau", "strasse", "klinik"])
37
+ idx_terms = col[
38
+ col.str.contains(
39
+ "|".join(_terms),
40
+ case=False,
41
+ regex=True,
42
+ )
43
+ ].index
44
+
45
+ # # * optional: search for terms in whole df
46
+ # df.apply(lambda row: row.astype(str).str.contains('test', case=False, regex=True).any(), axis=1)
47
+
48
+ # # * find dates
49
+ ptr_date = r"\d{2}\.\d{2}\.\d{4}"
50
+ idx_date = col[col.str.contains(ptr_date, regex=True)].index
51
+
52
+ # * dr
53
+ ptr_dr = r"[D|d][R|r]\. | Fr\. | Hr\. | PD "
54
+ idx_dr = col[col.str.contains(ptr_dr, regex=True)].index
55
+
56
+ # * custom
57
+ idx_custom = (
58
+ col[col.str.contains(custom_regex, regex=True)].index
59
+ if custom_regex
60
+ else pd.Index([])
61
+ )
62
+
63
+ idx_all = idx_terms.union(idx_date).union(idx_dr).union(idx_custom)
64
+
65
+ if verbose:
66
+ # print(f"found: {idx_dr.__len__()} dr | {idx_date.__len__()} date | {idx_terms.__len__()} terms")
67
+ print(f"found {idx_all.__len__():_} pii items:")
68
+ print(col.loc[idx_all].tolist())
69
+
70
+ if logging: # Assuming logging is defined and has the correct value
71
+ data = col.loc[idx_all] # Assuming col and idx_all are defined
72
+ with open(".pii.log", "w") as f:
73
+ # ! when using str(), it will give only a summary!
74
+ f.write(data.to_string(index=True))
75
+
76
+ return idx_all
@@ -1,10 +0,0 @@
1
- pandas_plots/hlp.py,sha256=N6NrbFagVMMX-ZnV0rIBEz82SeSoOkksfMcCap55W7E,16588
2
- pandas_plots/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
3
- pandas_plots/pls.py,sha256=isveg6_frLZC3Gt3VEsdOLiLw7aTf3riUahmJLHiEq8,40265
4
- pandas_plots/tbl.py,sha256=4VvjLisPT1gSvgsLClcrhC7LIJ-_FPNla8nomGflGag,30509
5
- pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
6
- pandas_plots-0.12.4.dist-info/LICENSE,sha256=6KQ5KVAAhRaB-JJKpX4cefKvRZRgI7GUPc92_2d31XY,1051
7
- pandas_plots-0.12.4.dist-info/METADATA,sha256=WZUfWOid_eYMtuS2V_P_C_ChaD1dTqDfuectlxzAJe8,7358
8
- pandas_plots-0.12.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
9
- pandas_plots-0.12.4.dist-info/top_level.txt,sha256=XnaNuIHBqMmCeh_U7nKOYTwFue_SIA0wxuDgdPmnnSk,13
10
- pandas_plots-0.12.4.dist-info/RECORD,,