pandas-plots 0.12.5__tar.gz → 0.12.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: pandas-plots
3
- Version: 0.12.5
3
+ Version: 0.12.6
4
4
  Summary: A collection of helper for table handling and visualization
5
5
  Home-page: https://github.com/smeisegeier/pandas-plots
6
6
  Author: smeisegeier
@@ -1,6 +1,6 @@
1
1
  [metadata]
2
2
  name = pandas-plots
3
- version = 0.12.5
3
+ version = 0.12.6
4
4
  author = smeisegeier
5
5
  author_email = dexterDSDo@googlemail.com
6
6
  description = A collection of helper for table handling and visualization
@@ -0,0 +1,76 @@
1
+ import pandas as pd
2
+ import re
3
+
4
+
5
def remove_pii(
    series: pd.Series,
    verbose: bool = True,
    logging: bool = False,
    custom_regex: str = "",
) -> pd.Index:
    """
    Locate entries of a text column that look like personally identifiable
    information (PII).

    Despite its name, this function does not modify ``series``; it only
    returns the index labels of the suspicious entries so the caller can drop
    them.

    An entry is flagged when it matches at least one of:
    - one of the terms "lösch", "herr", "frau", "strasse", "klinik"
      (case-insensitive, substring match)
    - a date written as ``dd.mm.yyyy``
    - a title: "Dr. " in any letter case, or " Fr. ", " Hr. ", " PD "
      surrounded by spaces
    - ``custom_regex``, if given

    Missing values and non-string values are never flagged.

    Parameters:
    - series: A pandas Series representing a column in a DataFrame.
    - verbose: If True, print the number of pii items and the items themselves
    - logging: If True, write pii items into the file .pii.log in the current
      working directory (the file is overwritten on every call)
    - custom_regex: Regex that is injected into detection; empty means unused

    Returns:
    - index object with indexes of all pii items

    Raises:
    - AssertionError: if ``series`` has no rows

    Remarks:
    - df.drop(axis=0, index=result, inplace=True)
    """
    # * reject empty columns
    # raised explicitly (instead of `assert`) so the check survives `python -O`;
    # the exception type is kept for backward compatibility
    if len(series) == 0:
        raise AssertionError("series must not be empty")

    # * na must be dropped to ensure processing; dropna() returns a new object,
    # * so the caller's series is left untouched
    col = series.dropna()

    def _matching_index(pattern: str, case: bool = True) -> pd.Index:
        # na=False: non-string values yield NaN in str.contains, which would
        # make the boolean mask unusable for indexing
        mask = col.str.contains(pattern, case=case, regex=True, na=False)
        return col[mask].index

    # * find terms
    terms = ("lösch", "herr", "frau", "strasse", "klinik")
    idx_terms = _matching_index("|".join(terms), case=False)

    # * find dates
    idx_date = _matching_index(r"\d{2}\.\d{2}\.\d{4}")

    # * dr
    # inside a character class `|` is a literal, so it must not be used as a
    # separator there
    idx_dr = _matching_index(r"[Dd][Rr]\. | Fr\. | Hr\. | PD ")

    # * custom
    # the empty fallback is sliced from the column's own index so the union
    # below keeps the index dtype instead of being upcast to object
    idx_custom = _matching_index(custom_regex) if custom_regex else col.index[:0]

    idx_all = idx_terms.union(idx_date).union(idx_dr).union(idx_custom)

    if verbose:
        print(f"found {len(idx_all):_} pii items:")
        print(col.loc[idx_all].tolist())

    if logging:
        data = col.loc[idx_all]
        # explicit encoding: the data is expected to contain umlauts and the
        # platform default encoding is not guaranteed to handle them
        with open(".pii.log", "w", encoding="utf-8") as f:
            # ! when using str(), it will give only a summary!
            f.write(data.to_string(index=True))

    return idx_all
@@ -256,16 +256,6 @@ def plot_stacked_bars(
256
256
  col_index = df.columns[0] if not swap else df.columns[1]
257
257
  col_color = df.columns[1] if not swap else df.columns[0]
258
258
 
259
- # * assign colors to columns
260
- unique_colors = sorted(df[col_color].unique())
261
- column_colors = assign_column_colors(unique_colors, color_palette, null_label)
262
-
263
- # * add total as aggregation of df
264
- if show_total:
265
- df_total = df.copy()
266
- df_total[col_index] = " TOTAL" # add space to make this item first
267
- df = pd.concat([df, df_total])
268
-
269
259
  # * ensure df is grouped to prevent false aggregations
270
260
  df = (
271
261
  df.groupby([df.columns[0], df.columns[1]])
@@ -281,10 +271,31 @@ def plot_stacked_bars(
281
271
  )
282
272
  else:
283
273
  sort_order = sorted(df[col_index].unique()) # Alphabetical order
274
+ df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
275
+
276
+ # * add total as aggregation of df
277
+ if show_total:
278
+ df_total = df.copy()
279
+ df_total[col_index] = " TOTAL" # add space to make this item first
280
+ df = pd.concat([df, df_total])
284
281
 
285
282
  # * Convert to categorical with explicit ordering
286
283
  df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
287
284
 
285
+ if top_n_index > 0 and len(sort_order) > top_n_index:
286
+ top_categories = sort_order[:top_n_index]
287
+ df[col_index] = df[col_index].apply(lambda x: x if x in top_categories else "<other>")
288
+
289
+ unique_colors = sorted(df[col_color].unique())
290
+ if top_n_color > 0 and len(unique_colors) > top_n_color:
291
+ top_colors = unique_colors[:top_n_color]
292
+ df[col_color] = df[col_color].apply(lambda x: x if x in top_colors else "<other>")
293
+
294
+ column_colors = assign_column_colors(sorted(df[col_color].unique()), color_palette, null_label)
295
+
296
+ # # * assign colors to columns
297
+ # unique_colors = sorted(df[col_color].unique())
298
+ # column_colors = assign_column_colors(unique_colors, color_palette, null_label)
288
299
 
289
300
  # * calculate n
290
301
  divider = 2 if show_total else 1
@@ -312,6 +323,8 @@ def plot_stacked_bars(
312
323
  height=height,
313
324
  color_discrete_map=column_colors, # Use assigned colors
314
325
  category_orders={col_index: list(df[col_index].cat.categories)}, # <- Add this line
326
+ # category_orders={col_index: df[col_index].categories.tolist() if isinstance(df[col_index].dtype, pd.CategoricalDtype) else sorted(df[col_index].unique())}
327
+
315
328
  )
316
329
  # * get longest bar
317
330
  bar_max = (
@@ -344,13 +357,14 @@ def plot_stacked_bars(
344
357
  if orientation == "h":
345
358
  if relative:
346
359
  fig.update_xaxes(dtick=5)
347
- elif normalize:
348
- fig.update_xaxes(dtick=0.05)
360
+ # bug dticks are ultra dense
361
+ # elif normalize:
362
+ # fig.update_xaxes(dtick=0.05)
349
363
  else:
350
364
  if relative:
351
365
  fig.update_yaxes(dtick=5)
352
- elif normalize:
353
- fig.update_yaxes(dtick=0.05)
366
+ # elif normalize:
367
+ # fig.update_yaxes(dtick=0.05)
354
368
 
355
369
  # * show grids, set to smaller distance on pct scale
356
370
  fig.update_xaxes(showgrid=True, gridwidth=1)
@@ -961,7 +975,8 @@ def plot_box(
961
975
  fig.show("png")
962
976
 
963
977
  if summary:
964
- print_summary(ser)
978
+ # * if only series is provided, col name is None
979
+ print_summary(ser.to_frame())
965
980
 
966
981
  # * save to png if path is provided
967
982
  if png_path is not None:
@@ -1175,8 +1190,23 @@ def plot_facet_stacked_bars(
1175
1190
 
1176
1191
  aggregated_df = aggregate_data(df, top_n_index, top_n_columns, top_n_facet, null_label)
1177
1192
 
1178
- facets = aggregated_df['facet'].unique()
1179
- columns = sorted(aggregated_df['col'].unique())
1193
+ # facets = aggregated_df['facet'].unique()
1194
+ facets = sorted(aggregated_df['facet'].unique()) # Ensure facets are sorted consistently
1195
+
1196
+ if top_n_columns > 0:
1197
+ top_columns = aggregated_df.groupby('col', observed=True)['value'].sum().nlargest(top_n_columns).index.tolist()
1198
+ # aggregated_df['col'] = aggregated_df['col'].apply(lambda x: x if x in top_columns else "<other>")
1199
+ # aggregated_df['col'] = pd.Categorical(aggregated_df['col'], categories=top_columns + ["<other>"], ordered=True)
1200
+ # aggregated_df['col'] = pd.Categorical(
1201
+ # aggregated_df['col'].map(lambda x: x if x in top_columns else "<other>"),
1202
+ # categories=top_columns + ["<other>"],
1203
+ # ordered=True
1204
+ # )
1205
+ aggregated_df['col'] = aggregated_df['col'].apply(lambda x: x if x in top_columns else "<other>")
1206
+
1207
+
1208
+ # columns = sorted(aggregated_df['col'].unique())
1209
+ columns = aggregated_df.groupby('col', observed=True)['value'].sum().sort_values(ascending=False).index.tolist()
1180
1210
  column_colors = assign_column_colors(columns, color_palette, null_label)
1181
1211
 
1182
1212
  fig = make_subplots(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: pandas-plots
3
- Version: 0.12.5
3
+ Version: 0.12.6
4
4
  Summary: A collection of helper for table handling and visualization
5
5
  Home-page: https://github.com/smeisegeier/pandas-plots
6
6
  Author: smeisegeier
@@ -10,5 +10,6 @@ src/pandas_plots/ven.py
10
10
  src/pandas_plots.egg-info/PKG-INFO
11
11
  src/pandas_plots.egg-info/SOURCES.txt
12
12
  src/pandas_plots.egg-info/dependency_links.txt
13
+ src/pandas_plots.egg-info/pii.py
13
14
  src/pandas_plots.egg-info/requires.txt
14
15
  src/pandas_plots.egg-info/top_level.txt
File without changes
File without changes