pandas-plots 0.12.5__tar.gz → 0.12.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pandas_plots-0.12.5/src/pandas_plots.egg-info → pandas_plots-0.12.6}/PKG-INFO +1 -1
- {pandas_plots-0.12.5 → pandas_plots-0.12.6}/setup.cfg +1 -1
- pandas_plots-0.12.6/src/pandas_plots/pii.py +76 -0
- {pandas_plots-0.12.5 → pandas_plots-0.12.6}/src/pandas_plots/pls.py +47 -17
- {pandas_plots-0.12.5 → pandas_plots-0.12.6/src/pandas_plots.egg-info}/PKG-INFO +1 -1
- {pandas_plots-0.12.5 → pandas_plots-0.12.6}/src/pandas_plots.egg-info/SOURCES.txt +1 -0
- {pandas_plots-0.12.5 → pandas_plots-0.12.6}/LICENSE +0 -0
- {pandas_plots-0.12.5 → pandas_plots-0.12.6}/README.md +0 -0
- {pandas_plots-0.12.5 → pandas_plots-0.12.6}/pyproject.toml +0 -0
- {pandas_plots-0.12.5 → pandas_plots-0.12.6}/src/pandas_plots/hlp.py +0 -0
- {pandas_plots-0.12.5 → pandas_plots-0.12.6}/src/pandas_plots/tbl.py +0 -0
- {pandas_plots-0.12.5 → pandas_plots-0.12.6}/src/pandas_plots/ven.py +0 -0
- {pandas_plots-0.12.5 → pandas_plots-0.12.6}/src/pandas_plots.egg-info/dependency_links.txt +0 -0
- {pandas_plots-0.12.5/src/pandas_plots → pandas_plots-0.12.6/src/pandas_plots.egg-info}/pii.py +0 -0
- {pandas_plots-0.12.5 → pandas_plots-0.12.6}/src/pandas_plots.egg-info/requires.txt +0 -0
- {pandas_plots-0.12.5 → pandas_plots-0.12.6}/src/pandas_plots.egg-info/top_level.txt +0 -0
@@ -0,0 +1,76 @@
|
|
1
|
+
import pandas as pd
|
2
|
+
import re
|
3
|
+
|
4
|
+
|
5
|
+
def remove_pii(
    series: pd.Series,
    verbose: bool = True,
    logging: bool = False,
    custom_regex="",
) -> pd.Index:
    """
    Locate personally identifiable information (PII) in the given column.

    The input series is not modified; the function only reports which index
    labels hold suspicious values so the caller can drop them.

    Parameters:
    - series: A pandas Series representing a column in a DataFrame.
    - verbose: If True, print the number of pii items and the items themselves
    - logging: If True, write pii items into the file .pii.log (overwritten on each call)
    - custom_regex: Regex that is injected into detection (skipped when empty)

    Returns:
    - index object with indexes of all pii items

    Raises:
    - AssertionError: if the series is empty

    Remarks:
    - df.drop(axis=0, index=result, inplace=True)
    """

    # * reject empty columns
    # raised explicitly (instead of `assert`) so the check survives `python -O`
    if len(series) == 0:
        raise AssertionError("series must not be empty")

    # * na must be dropped to ensure processing; dropna() already returns a new object
    col = series.dropna()

    def _matching_index(pattern: str, case: bool = True) -> pd.Index:
        # na=False: non-string cells make str.contains return NaN, and a mask
        # containing NaN cannot be used for boolean indexing
        mask = col.str.contains(pattern, case=case, regex=True, na=False)
        return col[mask].index

    # * find terms (case-insensitive substrings)
    _terms = ("lösch", "herr", "frau", "strasse", "klinik")
    idx_terms = _matching_index("|".join(_terms), case=False)

    # * find dates (dd.mm.yyyy)
    ptr_date = r"\d{2}\.\d{2}\.\d{4}"
    idx_date = _matching_index(ptr_date)

    # * dr / titles
    # the blanks are part of the alternatives: every title must be followed by
    # a blank, and "Fr.", "Hr." and "PD" must also be preceded by one
    # [Dd][Rr] instead of [D|d][R|r]: inside a character class "|" is a literal
    ptr_dr = r"[Dd][Rr]\. | Fr\. | Hr\. | PD "
    idx_dr = _matching_index(ptr_dr)

    # * custom
    # an empty slice of the column's own index keeps the index type on union
    idx_custom = _matching_index(custom_regex) if custom_regex else col.index[:0]

    idx_all = idx_terms.union(idx_date).union(idx_dr).union(idx_custom)

    if verbose:
        print(f"found {len(idx_all):_} pii items:")
        print(col.loc[idx_all].tolist())

    if logging:
        data = col.loc[idx_all]
        # explicit encoding: the items may contain umlauts and other non-ascii text
        with open(".pii.log", "w", encoding="utf-8") as f:
            # ! when using str(), it will give only a summary!
            f.write(data.to_string(index=True))

    return idx_all
|
@@ -256,16 +256,6 @@ def plot_stacked_bars(
|
|
256
256
|
col_index = df.columns[0] if not swap else df.columns[1]
|
257
257
|
col_color = df.columns[1] if not swap else df.columns[0]
|
258
258
|
|
259
|
-
# * assign colors to columns
|
260
|
-
unique_colors = sorted(df[col_color].unique())
|
261
|
-
column_colors = assign_column_colors(unique_colors, color_palette, null_label)
|
262
|
-
|
263
|
-
# * add total as aggregation of df
|
264
|
-
if show_total:
|
265
|
-
df_total = df.copy()
|
266
|
-
df_total[col_index] = " TOTAL" # add space to make this item first
|
267
|
-
df = pd.concat([df, df_total])
|
268
|
-
|
269
259
|
# * ensure df is grouped to prevent false aggregations
|
270
260
|
df = (
|
271
261
|
df.groupby([df.columns[0], df.columns[1]])
|
@@ -281,10 +271,31 @@ def plot_stacked_bars(
|
|
281
271
|
)
|
282
272
|
else:
|
283
273
|
sort_order = sorted(df[col_index].unique()) # Alphabetical order
|
274
|
+
df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
|
275
|
+
|
276
|
+
# * add total as aggregation of df
|
277
|
+
if show_total:
|
278
|
+
df_total = df.copy()
|
279
|
+
df_total[col_index] = " TOTAL" # add space to make this item first
|
280
|
+
df = pd.concat([df, df_total])
|
284
281
|
|
285
282
|
# * Convert to categorical with explicit ordering
|
286
283
|
df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
|
287
284
|
|
285
|
+
if top_n_index > 0 and len(sort_order) > top_n_index:
|
286
|
+
top_categories = sort_order[:top_n_index]
|
287
|
+
df[col_index] = df[col_index].apply(lambda x: x if x in top_categories else "<other>")
|
288
|
+
|
289
|
+
unique_colors = sorted(df[col_color].unique())
|
290
|
+
if top_n_color > 0 and len(unique_colors) > top_n_color:
|
291
|
+
top_colors = unique_colors[:top_n_color]
|
292
|
+
df[col_color] = df[col_color].apply(lambda x: x if x in top_colors else "<other>")
|
293
|
+
|
294
|
+
column_colors = assign_column_colors(sorted(df[col_color].unique()), color_palette, null_label)
|
295
|
+
|
296
|
+
# # * assign colors to columns
|
297
|
+
# unique_colors = sorted(df[col_color].unique())
|
298
|
+
# column_colors = assign_column_colors(unique_colors, color_palette, null_label)
|
288
299
|
|
289
300
|
# * calculate n
|
290
301
|
divider = 2 if show_total else 1
|
@@ -312,6 +323,8 @@ def plot_stacked_bars(
|
|
312
323
|
height=height,
|
313
324
|
color_discrete_map=column_colors, # Use assigned colors
|
314
325
|
category_orders={col_index: list(df[col_index].cat.categories)}, # <- Add this line
|
326
|
+
# category_orders={col_index: df[col_index].categories.tolist() if isinstance(df[col_index].dtype, pd.CategoricalDtype) else sorted(df[col_index].unique())}
|
327
|
+
|
315
328
|
)
|
316
329
|
# * get longest bar
|
317
330
|
bar_max = (
|
@@ -344,13 +357,14 @@ def plot_stacked_bars(
|
|
344
357
|
if orientation == "h":
|
345
358
|
if relative:
|
346
359
|
fig.update_xaxes(dtick=5)
|
347
|
-
|
348
|
-
|
360
|
+
# bug dticks are ultra dense
|
361
|
+
# elif normalize:
|
362
|
+
# fig.update_xaxes(dtick=0.05)
|
349
363
|
else:
|
350
364
|
if relative:
|
351
365
|
fig.update_yaxes(dtick=5)
|
352
|
-
elif normalize:
|
353
|
-
|
366
|
+
# elif normalize:
|
367
|
+
# fig.update_yaxes(dtick=0.05)
|
354
368
|
|
355
369
|
# * show grids, set to smaller distance on pct scale
|
356
370
|
fig.update_xaxes(showgrid=True, gridwidth=1)
|
@@ -961,7 +975,8 @@ def plot_box(
|
|
961
975
|
fig.show("png")
|
962
976
|
|
963
977
|
if summary:
|
964
|
-
|
978
|
+
# * if only series is provided, col name is None
|
979
|
+
print_summary(ser.to_frame())
|
965
980
|
|
966
981
|
# * save to png if path is provided
|
967
982
|
if png_path is not None:
|
@@ -1175,8 +1190,23 @@ def plot_facet_stacked_bars(
|
|
1175
1190
|
|
1176
1191
|
aggregated_df = aggregate_data(df, top_n_index, top_n_columns, top_n_facet, null_label)
|
1177
1192
|
|
1178
|
-
facets = aggregated_df['facet'].unique()
|
1179
|
-
|
1193
|
+
# facets = aggregated_df['facet'].unique()
|
1194
|
+
facets = sorted(aggregated_df['facet'].unique()) # Ensure facets are sorted consistently
|
1195
|
+
|
1196
|
+
if top_n_columns > 0:
|
1197
|
+
top_columns = aggregated_df.groupby('col', observed=True)['value'].sum().nlargest(top_n_columns).index.tolist()
|
1198
|
+
# aggregated_df['col'] = aggregated_df['col'].apply(lambda x: x if x in top_columns else "<other>")
|
1199
|
+
# aggregated_df['col'] = pd.Categorical(aggregated_df['col'], categories=top_columns + ["<other>"], ordered=True)
|
1200
|
+
# aggregated_df['col'] = pd.Categorical(
|
1201
|
+
# aggregated_df['col'].map(lambda x: x if x in top_columns else "<other>"),
|
1202
|
+
# categories=top_columns + ["<other>"],
|
1203
|
+
# ordered=True
|
1204
|
+
# )
|
1205
|
+
aggregated_df['col'] = aggregated_df['col'].apply(lambda x: x if x in top_columns else "<other>")
|
1206
|
+
|
1207
|
+
|
1208
|
+
# columns = sorted(aggregated_df['col'].unique())
|
1209
|
+
columns = aggregated_df.groupby('col', observed=True)['value'].sum().sort_values(ascending=False).index.tolist()
|
1180
1210
|
column_colors = assign_column_colors(columns, color_palette, null_label)
|
1181
1211
|
|
1182
1212
|
fig = make_subplots(
|
@@ -10,5 +10,6 @@ src/pandas_plots/ven.py
|
|
10
10
|
src/pandas_plots.egg-info/PKG-INFO
|
11
11
|
src/pandas_plots.egg-info/SOURCES.txt
|
12
12
|
src/pandas_plots.egg-info/dependency_links.txt
|
13
|
+
src/pandas_plots.egg-info/pii.py
|
13
14
|
src/pandas_plots.egg-info/requires.txt
|
14
15
|
src/pandas_plots.egg-info/top_level.txt
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{pandas_plots-0.12.5/src/pandas_plots → pandas_plots-0.12.6/src/pandas_plots.egg-info}/pii.py
RENAMED
File without changes
|
File without changes
|
File without changes
|