pandas-plots 0.12.5__tar.gz → 0.12.7__tar.gz

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: pandas-plots
- Version: 0.12.5
+ Version: 0.12.7
  Summary: A collection of helper for table handling and visualization
  Home-page: https://github.com/smeisegeier/pandas-plots
  Author: smeisegeier
@@ -1,6 +1,6 @@
  [metadata]
  name = pandas-plots
- version = 0.12.5
+ version = 0.12.7
  author = smeisegeier
  author_email = dexterDSDo@googlemail.com
  description = A collection of helper for table handling and visualization
@@ -0,0 +1,76 @@
+ import pandas as pd
+ import re
+
+
+ def remove_pii(
+     series: pd.Series,
+     verbose: bool = True,
+     logging: bool = False,
+     custom_regex="",
+ ) -> pd.Index:
+     """
+     Remove personally identifiable information (PII) from the given column.
+
+     Parameters:
+     - series: A pandas Series representing a column in a DataFrame.
+     - verbose: If True, print pii items
+     - logging: If True, write pii items into the file .pii.log
+     - custom_regex: Regex that is injected into detection
+
+     Returns:
+     - index object with indexes of all pii items
+
+     Remarks:
+     - df.drop(axis=0, index=result, inplace=True)
+     """
+
+     # * reject empty columns
+     assert len(series) > 0
+
+     col = series.copy()
+
+     # * na must be dropped to ensure processsing
+     col.dropna(inplace=True)
+
+     # * find terms
+     _terms = frozenset(["lösch", "herr", "frau", "strasse", "klinik"])
+     idx_terms = col[
+         col.str.contains(
+             "|".join(_terms),
+             case=False,
+             regex=True,
+         )
+     ].index
+
+     # # * optional: search for terms in whole df
+     # df.apply(lambda row: row.astype(str).str.contains('test', case=False, regex=True).any(), axis=1)
+
+     # # * find dates
+     ptr_date = r"\d{2}\.\d{2}\.\d{4}"
+     idx_date = col[col.str.contains(ptr_date, regex=True)].index
+
+     # * dr
+     ptr_dr = r"[D|d][R|r]\. | Fr\. | Hr\. | PD "
+     idx_dr = col[col.str.contains(ptr_dr, regex=True)].index
+
+     # * custom
+     idx_custom = (
+         col[col.str.contains(custom_regex, regex=True)].index
+         if custom_regex
+         else pd.Index([])
+     )
+
+     idx_all = idx_terms.union(idx_date).union(idx_dr).union(idx_custom)
+
+     if verbose:
+         # print(f"found: {idx_dr.__len__()} dr | {idx_date.__len__()} date | {idx_terms.__len__()} terms")
+         print(f"found {idx_all.__len__():_} pii items:")
+         print(col.loc[idx_all].tolist())
+
+     if logging: # Assuming logging is defined and has the correct value
+         data = col.loc[idx_all] # Assuming col and idx_all are defined
+         with open(".pii.log", "w") as f:
+             # ! when using str(), it will give only a summary!
+             f.write(data.to_string(index=True))
+
+     return idx_all
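The new pii.py module centers on remove_pii, which only collects the index labels of suspicious rows; dropping them is left to the caller. A minimal usage sketch (the import path and the sample data are assumptions, not part of this diff):

    import pandas as pd
    from pandas_plots.pii import remove_pii  # import path assumed from the added file

    # hypothetical free-text column containing PII-like entries
    df = pd.DataFrame(
        {"note": ["Herr Meier, Musterstrasse 1", "unauffällig", "Dr. Kim, 01.02.1990", None]}
    )

    # returns a pd.Index of flagged rows: term hits, dd.mm.yyyy dates, "Dr."/"Fr."/"Hr." patterns
    pii_idx = remove_pii(df["note"], verbose=True, logging=False)

    # dropping is left to the caller, as the docstring remark suggests
    df = df.drop(index=pii_idx)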
@@ -12,6 +12,7 @@ from matplotlib import pyplot as plt
  from plotly import express as px
  import plotly.graph_objects as go
  from plotly.subplots import make_subplots
+ import plotly # needed for return types
 
  from .hlp import *
  from .tbl import print_summary
@@ -189,7 +190,7 @@ def plot_stacked_bars(
      png_path: Path | str = None,
      color_palette: str = "Plotly",
      null_label: str = "<NA>",
- ) -> object:
+ ) -> plotly.graph_objects:
      """
      Generates a stacked bar plot using the provided DataFrame.
 
@@ -220,7 +221,7 @@ def plot_stacked_bars(
      - A Plotly figure object representing the stacked bar chart.
      """
      BAR_LENGTH_MULTIPLIER = 1.05
-
+
      # * 2 axis means at least 2 columns
      if len(df.columns) < 2 or len(df.columns) > 3:
          print("❌ df must have exactly 2 or 3 columns")
@@ -256,16 +257,6 @@ def plot_stacked_bars(
      col_index = df.columns[0] if not swap else df.columns[1]
      col_color = df.columns[1] if not swap else df.columns[0]
 
-     # * assign colors to columns
-     unique_colors = sorted(df[col_color].unique())
-     column_colors = assign_column_colors(unique_colors, color_palette, null_label)
-
-     # * add total as aggregation of df
-     if show_total:
-         df_total = df.copy()
-         df_total[col_index] = " TOTAL" # add space to make this item first
-         df = pd.concat([df, df_total])
-
      # * ensure df is grouped to prevent false aggregations
      df = (
          df.groupby([df.columns[0], df.columns[1]])
@@ -273,8 +264,32 @@ def plot_stacked_bars(
          .sum()
          .reset_index()
      )
+
+     # * add total as aggregation of df
+     if show_total:
+         df_total = df.groupby(df.columns[1], observed=True, as_index=False)[df.columns[2]].sum()
+         df_total[df.columns[0]] = " Total"
+         df = pd.concat([df, df_total], ignore_index=True)
+
+
+     # * apply top_n, reduce df
+     n_col = top_n_color if top_n_color > 0 else None
+     n_idx = top_n_index if top_n_index > 0 else None
+
+     unique_colors = sorted(
+         df.groupby(col_color)[df.columns[2]]
+         .sum()
+         .sort_values(ascending=False)
+         .index.tolist()[:n_col]
+     )
 
-     # * Sorting logic based on sort_values
+     unique_idx = df[col_index].sort_values().unique()[:n_idx]
+
+     df = df[df[col_color].isin(unique_colors)]#.sort_values(by=[col_index, col_color])
+     df = df[df[col_index].isin(unique_idx)]#.sort_values(by=[col_index, col_color])
+
+
+     # # * Sorting logic based on sort_values
      if sort_values:
          sort_order = (
              df.groupby(col_index)[df.columns[2]].sum().sort_values(ascending=False).index
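In 0.12.7 the optional total row is built after grouping, as a per-color aggregation, and top_n_index / top_n_color now shrink the frame before plotting. A standalone sketch of that reduction on a toy frame (column names and values are invented for illustration):

    import pandas as pd

    # toy long-format input: index column, color column, value column
    df = pd.DataFrame(
        {"region": ["a", "a", "b", "b", "c"],
         "product": ["x", "y", "x", "y", "x"],
         "cnt": [5, 3, 2, 8, 1]}
    )
    df = df.groupby(["region", "product"], as_index=False)["cnt"].sum()

    # total row per color category; the leading space in " Total" sorts it first
    df_total = df.groupby("product", as_index=False)["cnt"].sum()
    df_total["region"] = " Total"
    df = pd.concat([df, df_total], ignore_index=True)

    # keep only the top-n color categories by summed value and the first n index values
    top_n_color, top_n_index = 1, 2
    n_col = top_n_color if top_n_color > 0 else None
    n_idx = top_n_index if top_n_index > 0 else None
    unique_colors = sorted(df.groupby("product")["cnt"].sum().sort_values(ascending=False).index[:n_col])
    unique_idx = df["region"].sort_values().unique()[:n_idx]
    df = df[df["product"].isin(unique_colors) & df["region"].isin(unique_idx)]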
@@ -282,9 +297,14 @@ def plot_stacked_bars(
      else:
          sort_order = sorted(df[col_index].unique()) # Alphabetical order
 
-     # * Convert to categorical with explicit ordering
+     # # * Convert to categorical with explicit ordering
      df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
 
+     column_colors = assign_column_colors(
+         columns=unique_colors,
+         color_palette=color_palette,
+         null_label=null_label
+     )
 
      # * calculate n
      divider = 2 if show_total else 1
@@ -297,13 +317,24 @@ def plot_stacked_bars(
      _title_str_n = f", n={n:_}"
      caption = _set_caption(caption)
 
+     # * after grouping add cols for pct and formatting
+     df["pct"] = df[df.columns[2]].apply(lambda x: f"{(x / n) * 100:.{precision}f}%")
+
+     # * format output
+     df["cnt_str"] = df[df.columns[2]].apply(lambda x: f"{x:_.{precision}f}")
+
+     divider2 = "<br>" if orientation == "v" else " "
+     df["cnt_pct_str"] = df.apply(
+         lambda row: f"{row['cnt_str']}{divider2}({row['pct']})", axis=1
+     )
+
      # * plot
      fig = px.bar(
          df,
          x=col_index if orientation == "v" else df.columns[2],
          y=df.columns[2] if orientation == "v" else col_index,
          color=col_color,
-         text=df.columns[2],
+         text="cnt_pct_str" if normalize else "cnt_str",
          orientation=orientation,
          title=title
          or f"{caption}{_title_str_top_index}[{col_index}] by {_title_str_top_color}[{col_color}]{_title_str_null}{_title_str_n}",
@@ -312,7 +343,9 @@ def plot_stacked_bars(
          height=height,
          color_discrete_map=column_colors, # Use assigned colors
          category_orders={col_index: list(df[col_index].cat.categories)}, # <- Add this line
+
      )
+
      # * get longest bar
      bar_max = (
          df.groupby(col_index)[df.columns[2]].sum().sort_values(ascending=False).iloc[0]
@@ -344,13 +377,14 @@ def plot_stacked_bars(
      if orientation == "h":
          if relative:
              fig.update_xaxes(dtick=5)
-         elif normalize:
-             fig.update_xaxes(dtick=0.05)
+         # bug dticks are ultra dense
+         # elif normalize:
+         # fig.update_xaxes(dtick=0.05)
      else:
          if relative:
              fig.update_yaxes(dtick=5)
-         elif normalize:
-             fig.update_yaxes(dtick=0.05)
+         # elif normalize:
+         # fig.update_yaxes(dtick=0.05)
 
      # * show grids, set to smaller distance on pct scale
      fig.update_xaxes(showgrid=True, gridwidth=1)
@@ -474,7 +508,10 @@ def plot_bars(
 
      # * after grouping add cols for pct and formatting
      df["pct"] = df[df.columns[1]] / n
+
+     # * format output
      df["cnt_str"] = df[df.columns[1]].apply(lambda x: f"{x:_.{precision}f}")
+
      divider = "<br>" if orientation == "v" else " "
      df["cnt_pct_str"] = df.apply(
          lambda row: f"{row['cnt_str']}{divider}({row['pct']:.1%})", axis=1
@@ -961,7 +998,8 @@ def plot_box(
          fig.show("png")
 
      if summary:
-         print_summary(ser)
+         # * if only series is provided, col name is None
+         print_summary(ser.to_frame())
 
      # * save to png if path is provided
      if png_path is not None:
@@ -976,7 +1014,7 @@ def plot_boxes(
      points: Literal["all", "outliers", "suspectedoutliers", None] = None,
      precision: int = 2,
      height: int = 600,
-     width: int = 800,
+     width: int = 1200,
      annotations: bool = True,
      summary: bool = True,
      title: str = None,
@@ -1003,7 +1041,7 @@
      if (
          len(df.columns) != 2
          or not (
-             (pd.api.types.is_string_dtype(df.iloc[:, 0]))
+             (pd.api.types.is_object_dtype(df.iloc[:, 0]))
              or (pd.api.types.is_bool_dtype(df.iloc[:, 0]))
          )
          or not pd.api.types.is_numeric_dtype(df.iloc[:, 1])
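The plot_boxes input check now uses is_object_dtype instead of is_string_dtype for the grouping column. A quick illustration of what each check accepts, using plain pandas (not package code; results as of recent pandas versions):

    import pandas as pd

    obj_col = pd.Series(["a", "b"], dtype="object")
    str_col = pd.Series(["a", "b"], dtype="string")

    pd.api.types.is_object_dtype(obj_col)  # True
    pd.api.types.is_object_dtype(str_col)  # False - the new check rejects pandas "string" columns
    pd.api.types.is_string_dtype(obj_col)  # True
    pd.api.types.is_string_dtype(str_col)  # True  - the old check accepted both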
@@ -1109,7 +1147,7 @@
 
      fig.show("png")
      if summary:
-         print_summary(df)
+         print_summary(df=df, precision=precision)
 
      # * save to png if path is provided
      if png_path is not None:
@@ -1175,8 +1213,23 @@ def plot_facet_stacked_bars(
 
      aggregated_df = aggregate_data(df, top_n_index, top_n_columns, top_n_facet, null_label)
 
-     facets = aggregated_df['facet'].unique()
-     columns = sorted(aggregated_df['col'].unique())
+     # facets = aggregated_df['facet'].unique()
+     facets = sorted(aggregated_df['facet'].unique()) # Ensure facets are sorted consistently
+
+     if top_n_columns > 0:
+         top_columns = aggregated_df.groupby('col', observed=True)['value'].sum().nlargest(top_n_columns).index.tolist()
+         # aggregated_df['col'] = aggregated_df['col'].apply(lambda x: x if x in top_columns else "<other>")
+         # aggregated_df['col'] = pd.Categorical(aggregated_df['col'], categories=top_columns + ["<other>"], ordered=True)
+         # aggregated_df['col'] = pd.Categorical(
+         # aggregated_df['col'].map(lambda x: x if x in top_columns else "<other>"),
+         # categories=top_columns + ["<other>"],
+         # ordered=True
+         # )
+         aggregated_df['col'] = aggregated_df['col'].apply(lambda x: x if x in top_columns else "<other>")
+
+
+     # columns = sorted(aggregated_df['col'].unique())
+     columns = aggregated_df.groupby('col', observed=True)['value'].sum().sort_values(ascending=False).index.tolist()
      column_colors = assign_column_colors(columns, color_palette, null_label)
 
      fig = make_subplots(
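For plot_facet_stacked_bars, top_n_columns now keeps the largest column categories by summed value and folds everything else into "<other>" before colors are assigned, and the color order follows the summed values. A standalone sketch of that bucketing (toy data, names as in the hunk above):

    import pandas as pd

    aggregated_df = pd.DataFrame({"col": ["a", "b", "c", "d"], "value": [10, 7, 2, 1]})

    top_n_columns = 2
    top_columns = (
        aggregated_df.groupby("col", observed=True)["value"].sum()
        .nlargest(top_n_columns).index.tolist()
    )  # ['a', 'b']
    aggregated_df["col"] = aggregated_df["col"].apply(lambda x: x if x in top_columns else "<other>")

    # columns ordered by summed value, largest first, for stable color assignment
    columns = (
        aggregated_df.groupby("col", observed=True)["value"].sum()
        .sort_values(ascending=False).index.tolist()
    )  # ['a', 'b', '<other>']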
@@ -15,7 +15,7 @@ from plotly.subplots import make_subplots
  from scipy import stats
  import dataframe_image as dfi
 
- from .hlp import wrap_text
+ from .hlp import wrap_text, to_series
 
  import duckdb as ddb
 
@@ -696,7 +696,7 @@ def show_num_df(
 
 
 
- def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="🟠 "):
+ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str=" ", precision: int=3):
      """
      Print statistical summary for a pandas DataFrame or Series.
 
@@ -712,15 +712,44 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
          df (Union[pd.DataFrame, pd.Series]): Input DataFrame or Series. Only numeric columns
              in DataFrame are considered.
          show (bool, optional): Whether to print the summary. Defaults to True.
-         name (str, optional): Prefix for the summary. Defaults to "🟠 "
+         name (str, optional): Prefix for the summary. Defaults to " ".
+         precision (int, optional): Number of digits to round the results to. Defaults to 3.
      """
      if df.empty:
          return
 
      # * drop NA to keep scipy sane
-     df = df.dropna().copy()
+     df = df.dropna().copy()
 
-     def print_summary_ser(ser: pd.Series, show: bool=True, name: str=""):
+     # display(df)
+
+     if len(df.columns) == 1:
+         df = df.to_series()
+
+     pd.api.types.is_numeric_dtype(df)
+
+
+     if not (
+         # * series must be numeric
+         (isinstance(df, pd.Series)
+          and pd.api.types.is_numeric_dtype(df)
+         )
+         or
+         # * df must have 2 columns str num
+         (len(df.columns) == 2
+          and (
+              (pd.api.types.is_object_dtype(df.iloc[:, 0]))
+              or (pd.api.types.is_bool_dtype(df.iloc[:, 0]))
+          )
+          and pd.api.types.is_numeric_dtype(df.iloc[:, 1])
+         )
+     ):
+         print(f"❌ df must have 2 columns: [0] str or bool, [1] num, or be a series")
+         return
+
+
+
+     def print_summary_ser(ser: pd.Series, show: bool=True, name: str="", precision: int=3):
          # Calculate IQR and pass `rng=(25, 75)` to get the interquartile range
          iqr_value = stats.iqr(ser)
 
@@ -728,21 +757,21 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
          # ser.dropna(inplace=True)
 
          # Using the iqr function, we still calculate the bounds manually
-         q1 = stats.scoreatpercentile(ser, 25)
-         q3 = stats.scoreatpercentile(ser, 75)
+         q1 = round(stats.scoreatpercentile(ser, 25), precision)
+         q3 = round(stats.scoreatpercentile(ser, 75), precision)
 
          # Calculate upper bound directly
-         min = round(ser.min(),3)
-         med = round(ser.median(),3)
-         upper = round(q3 + 1.5 * iqr_value,3)
-         lower = round(q1 - 1.5 * iqr_value,3)
-         mean = round(ser.mean(),3)
-         std = round(ser.std(),3)
-         cv = round(ser.std() / ser.mean(),3)
-         max = round(ser.max(),3)
-         sum = round(ser.sum(),3)
-         skew = round(stats.skew(ser.dropna().tolist()),3)
-         kurto = round(stats.kurtosis(ser.dropna().tolist()),3)
+         min = round(ser.min(), precision)
+         med = round(ser.median(), precision)
+         upper = round(q3 + 1.5 * iqr_value, precision)
+         lower = round(q1 - 1.5 * iqr_value, precision)
+         mean = round(ser.mean(), precision)
+         std = round(ser.std(), precision)
+         cv = round(ser.std() / ser.mean(), precision)
+         max = round(ser.max(), precision)
+         sum = round(ser.sum(), precision)
+         skew = round(stats.skew(ser.dropna().tolist()), precision)
+         kurto = round(stats.kurtosis(ser.dropna().tolist()), precision)
 
          lower = min if lower < min else lower
          upper = max if upper > max else upper
@@ -750,7 +779,7 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
          # * extra care for scipy metrics, these are very vulnarable to nan
          if show:
              print(
-                 f"""{name} min: {min:_} | lower: {lower:_} | q25: {q1:_} | median: {med:_} | mean: {mean:_} | q75: {q3:_} | upper: {upper:_} | max: {max:_} | std: {std:_} | cv: {cv:_} | sum: {sum:_} | skew: {skew} | kurto: {kurto}""")
+                 f"""{name} -> min: {min:_} | lower: {lower:_} | q25: {q1:_} | median: {med:_} | mean: {mean:_} | q75: {q3:_} | upper: {upper:_} | max: {max:_} | std: {std:_} | cv: {cv:_} | sum: {sum:_} | skew: {skew} | kurto: {kurto}""")
 
          summary = {
              "min": min,
@@ -770,11 +799,22 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
          return summary
 
      if isinstance(df, pd.Series):
-         return print_summary_ser(df, show=show, name=name)
+         # * print serie
+         name = df.name if df.name else "series"
+         print_summary_ser(ser=df, show=show, name=name, precision=precision)
+         return
 
      if isinstance(df, pd.DataFrame):
-         # * only show numerics
-         for col in df.select_dtypes("number").columns:
-             summary = print_summary_ser(ser=df[col],show=show, name=col)
+         # * print for all values
+         print(f"🟧 all data")
+         name = df.columns[-1]
+         summary = print_summary_ser(ser=df.iloc[:,1], show=show, name=name, precision=precision)
+
+         print(f"🟧 boxes")
+         # * print for each value
+         for item in df.iloc[:,0].unique():
+             # display(df[df.iloc[:,0] == item])
+             print_summary_ser(ser=df[df.iloc[:,0] == item].iloc[:,1], show=show, name=item, precision=precision)
 
      return summary
+
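print_summary is now geared toward the two-column frames that plot_boxes passes in: a grouping column (object or bool) plus a numeric column. It prints one summary line over all data and one per group, rounded to precision digits. A hedged usage sketch (sample data invented; the module path follows the "from .tbl import print_summary" import shown earlier):

    import pandas as pd
    from pandas_plots.tbl import print_summary

    df = pd.DataFrame(
        {"group": ["a", "a", "b", "b", "b"],
         "value": [1.0, 2.5, 3.3, 4.1, 10.0]}
    )

    # prints "🟧 all data" with one summary line for the numeric column,
    # then "🟧 boxes" with one line per unique value of the grouping column
    print_summary(df=df, precision=2)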
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: pandas-plots
- Version: 0.12.5
+ Version: 0.12.7
  Summary: A collection of helper for table handling and visualization
  Home-page: https://github.com/smeisegeier/pandas-plots
  Author: smeisegeier
@@ -10,5 +10,6 @@ src/pandas_plots/ven.py
  src/pandas_plots.egg-info/PKG-INFO
  src/pandas_plots.egg-info/SOURCES.txt
  src/pandas_plots.egg-info/dependency_links.txt
+ src/pandas_plots.egg-info/pii.py
  src/pandas_plots.egg-info/requires.txt
  src/pandas_plots.egg-info/top_level.txt