pandas-plots 0.12.5__py3-none-any.whl → 0.12.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pandas_plots/pls.py +78 -25
- pandas_plots/tbl.py +63 -23
- {pandas_plots-0.12.5.dist-info → pandas_plots-0.12.7.dist-info}/METADATA +1 -1
- pandas_plots-0.12.7.dist-info/RECORD +11 -0
- pandas_plots-0.12.7.dist-info/pii.py +76 -0
- pandas_plots-0.12.5.dist-info/RECORD +0 -10
- {pandas_plots-0.12.5.dist-info → pandas_plots-0.12.7.dist-info}/LICENSE +0 -0
- {pandas_plots-0.12.5.dist-info → pandas_plots-0.12.7.dist-info}/WHEEL +0 -0
- {pandas_plots-0.12.5.dist-info → pandas_plots-0.12.7.dist-info}/top_level.txt +0 -0
pandas_plots/pls.py
CHANGED
@@ -12,6 +12,7 @@ from matplotlib import pyplot as plt
|
|
12
12
|
from plotly import express as px
|
13
13
|
import plotly.graph_objects as go
|
14
14
|
from plotly.subplots import make_subplots
|
15
|
+
import plotly # needed for return types
|
15
16
|
|
16
17
|
from .hlp import *
|
17
18
|
from .tbl import print_summary
|
@@ -189,7 +190,7 @@ def plot_stacked_bars(
|
|
189
190
|
png_path: Path | str = None,
|
190
191
|
color_palette: str = "Plotly",
|
191
192
|
null_label: str = "<NA>",
|
192
|
-
) ->
|
193
|
+
) -> plotly.graph_objects:
|
193
194
|
"""
|
194
195
|
Generates a stacked bar plot using the provided DataFrame.
|
195
196
|
|
@@ -220,7 +221,7 @@ def plot_stacked_bars(
|
|
220
221
|
- A Plotly figure object representing the stacked bar chart.
|
221
222
|
"""
|
222
223
|
BAR_LENGTH_MULTIPLIER = 1.05
|
223
|
-
|
224
|
+
|
224
225
|
# * 2 axis means at least 2 columns
|
225
226
|
if len(df.columns) < 2 or len(df.columns) > 3:
|
226
227
|
print("❌ df must have exactly 2 or 3 columns")
|
@@ -256,16 +257,6 @@ def plot_stacked_bars(
|
|
256
257
|
col_index = df.columns[0] if not swap else df.columns[1]
|
257
258
|
col_color = df.columns[1] if not swap else df.columns[0]
|
258
259
|
|
259
|
-
# * assign colors to columns
|
260
|
-
unique_colors = sorted(df[col_color].unique())
|
261
|
-
column_colors = assign_column_colors(unique_colors, color_palette, null_label)
|
262
|
-
|
263
|
-
# * add total as aggregation of df
|
264
|
-
if show_total:
|
265
|
-
df_total = df.copy()
|
266
|
-
df_total[col_index] = " TOTAL" # add space to make this item first
|
267
|
-
df = pd.concat([df, df_total])
|
268
|
-
|
269
260
|
# * ensure df is grouped to prevent false aggregations
|
270
261
|
df = (
|
271
262
|
df.groupby([df.columns[0], df.columns[1]])
|
@@ -273,8 +264,32 @@ def plot_stacked_bars(
|
|
273
264
|
.sum()
|
274
265
|
.reset_index()
|
275
266
|
)
|
267
|
+
|
268
|
+
# * add total as aggregation of df
|
269
|
+
if show_total:
|
270
|
+
df_total = df.groupby(df.columns[1], observed=True, as_index=False)[df.columns[2]].sum()
|
271
|
+
df_total[df.columns[0]] = " Total"
|
272
|
+
df = pd.concat([df, df_total], ignore_index=True)
|
273
|
+
|
274
|
+
|
275
|
+
# * apply top_n, reduce df
|
276
|
+
n_col = top_n_color if top_n_color > 0 else None
|
277
|
+
n_idx = top_n_index if top_n_index > 0 else None
|
278
|
+
|
279
|
+
unique_colors = sorted(
|
280
|
+
df.groupby(col_color)[df.columns[2]]
|
281
|
+
.sum()
|
282
|
+
.sort_values(ascending=False)
|
283
|
+
.index.tolist()[:n_col]
|
284
|
+
)
|
276
285
|
|
277
|
-
|
286
|
+
unique_idx = df[col_index].sort_values().unique()[:n_idx]
|
287
|
+
|
288
|
+
df = df[df[col_color].isin(unique_colors)]#.sort_values(by=[col_index, col_color])
|
289
|
+
df = df[df[col_index].isin(unique_idx)]#.sort_values(by=[col_index, col_color])
|
290
|
+
|
291
|
+
|
292
|
+
# # * Sorting logic based on sort_values
|
278
293
|
if sort_values:
|
279
294
|
sort_order = (
|
280
295
|
df.groupby(col_index)[df.columns[2]].sum().sort_values(ascending=False).index
|
@@ -282,9 +297,14 @@ def plot_stacked_bars(
|
|
282
297
|
else:
|
283
298
|
sort_order = sorted(df[col_index].unique()) # Alphabetical order
|
284
299
|
|
285
|
-
# * Convert to categorical with explicit ordering
|
300
|
+
# # * Convert to categorical with explicit ordering
|
286
301
|
df[col_index] = pd.Categorical(df[col_index], categories=sort_order, ordered=True)
|
287
302
|
|
303
|
+
column_colors = assign_column_colors(
|
304
|
+
columns=unique_colors,
|
305
|
+
color_palette=color_palette,
|
306
|
+
null_label=null_label
|
307
|
+
)
|
288
308
|
|
289
309
|
# * calculate n
|
290
310
|
divider = 2 if show_total else 1
|
@@ -297,13 +317,24 @@ def plot_stacked_bars(
|
|
297
317
|
_title_str_n = f", n={n:_}"
|
298
318
|
caption = _set_caption(caption)
|
299
319
|
|
320
|
+
# * after grouping add cols for pct and formatting
|
321
|
+
df["pct"] = df[df.columns[2]].apply(lambda x: f"{(x / n) * 100:.{precision}f}%")
|
322
|
+
|
323
|
+
# * format output
|
324
|
+
df["cnt_str"] = df[df.columns[2]].apply(lambda x: f"{x:_.{precision}f}")
|
325
|
+
|
326
|
+
divider2 = "<br>" if orientation == "v" else " "
|
327
|
+
df["cnt_pct_str"] = df.apply(
|
328
|
+
lambda row: f"{row['cnt_str']}{divider2}({row['pct']})", axis=1
|
329
|
+
)
|
330
|
+
|
300
331
|
# * plot
|
301
332
|
fig = px.bar(
|
302
333
|
df,
|
303
334
|
x=col_index if orientation == "v" else df.columns[2],
|
304
335
|
y=df.columns[2] if orientation == "v" else col_index,
|
305
336
|
color=col_color,
|
306
|
-
text=
|
337
|
+
text="cnt_pct_str" if normalize else "cnt_str",
|
307
338
|
orientation=orientation,
|
308
339
|
title=title
|
309
340
|
or f"{caption}{_title_str_top_index}[{col_index}] by {_title_str_top_color}[{col_color}]{_title_str_null}{_title_str_n}",
|
@@ -312,7 +343,9 @@ def plot_stacked_bars(
|
|
312
343
|
height=height,
|
313
344
|
color_discrete_map=column_colors, # Use assigned colors
|
314
345
|
category_orders={col_index: list(df[col_index].cat.categories)}, # <- Add this line
|
346
|
+
|
315
347
|
)
|
348
|
+
|
316
349
|
# * get longest bar
|
317
350
|
bar_max = (
|
318
351
|
df.groupby(col_index)[df.columns[2]].sum().sort_values(ascending=False).iloc[0]
|
@@ -344,13 +377,14 @@ def plot_stacked_bars(
|
|
344
377
|
if orientation == "h":
|
345
378
|
if relative:
|
346
379
|
fig.update_xaxes(dtick=5)
|
347
|
-
|
348
|
-
|
380
|
+
# bug dticks are ultra dense
|
381
|
+
# elif normalize:
|
382
|
+
# fig.update_xaxes(dtick=0.05)
|
349
383
|
else:
|
350
384
|
if relative:
|
351
385
|
fig.update_yaxes(dtick=5)
|
352
|
-
elif normalize:
|
353
|
-
|
386
|
+
# elif normalize:
|
387
|
+
# fig.update_yaxes(dtick=0.05)
|
354
388
|
|
355
389
|
# * show grids, set to smaller distance on pct scale
|
356
390
|
fig.update_xaxes(showgrid=True, gridwidth=1)
|
@@ -474,7 +508,10 @@ def plot_bars(
|
|
474
508
|
|
475
509
|
# * after grouping add cols for pct and formatting
|
476
510
|
df["pct"] = df[df.columns[1]] / n
|
511
|
+
|
512
|
+
# * format output
|
477
513
|
df["cnt_str"] = df[df.columns[1]].apply(lambda x: f"{x:_.{precision}f}")
|
514
|
+
|
478
515
|
divider = "<br>" if orientation == "v" else " "
|
479
516
|
df["cnt_pct_str"] = df.apply(
|
480
517
|
lambda row: f"{row['cnt_str']}{divider}({row['pct']:.1%})", axis=1
|
@@ -961,7 +998,8 @@ def plot_box(
|
|
961
998
|
fig.show("png")
|
962
999
|
|
963
1000
|
if summary:
|
964
|
-
|
1001
|
+
# * if only series is provided, col name is None
|
1002
|
+
print_summary(ser.to_frame())
|
965
1003
|
|
966
1004
|
# * save to png if path is provided
|
967
1005
|
if png_path is not None:
|
@@ -976,7 +1014,7 @@ def plot_boxes(
|
|
976
1014
|
points: Literal["all", "outliers", "suspectedoutliers", None] = None,
|
977
1015
|
precision: int = 2,
|
978
1016
|
height: int = 600,
|
979
|
-
width: int =
|
1017
|
+
width: int = 1200,
|
980
1018
|
annotations: bool = True,
|
981
1019
|
summary: bool = True,
|
982
1020
|
title: str = None,
|
@@ -1003,7 +1041,7 @@ def plot_boxes(
|
|
1003
1041
|
if (
|
1004
1042
|
len(df.columns) != 2
|
1005
1043
|
or not (
|
1006
|
-
(pd.api.types.
|
1044
|
+
(pd.api.types.is_object_dtype(df.iloc[:, 0]))
|
1007
1045
|
or (pd.api.types.is_bool_dtype(df.iloc[:, 0]))
|
1008
1046
|
)
|
1009
1047
|
or not pd.api.types.is_numeric_dtype(df.iloc[:, 1])
|
@@ -1109,7 +1147,7 @@ def plot_boxes(
|
|
1109
1147
|
|
1110
1148
|
fig.show("png")
|
1111
1149
|
if summary:
|
1112
|
-
print_summary(df)
|
1150
|
+
print_summary(df=df, precision=precision)
|
1113
1151
|
|
1114
1152
|
# * save to png if path is provided
|
1115
1153
|
if png_path is not None:
|
@@ -1175,8 +1213,23 @@ def plot_facet_stacked_bars(
|
|
1175
1213
|
|
1176
1214
|
aggregated_df = aggregate_data(df, top_n_index, top_n_columns, top_n_facet, null_label)
|
1177
1215
|
|
1178
|
-
facets = aggregated_df['facet'].unique()
|
1179
|
-
|
1216
|
+
# facets = aggregated_df['facet'].unique()
|
1217
|
+
facets = sorted(aggregated_df['facet'].unique()) # Ensure facets are sorted consistently
|
1218
|
+
|
1219
|
+
if top_n_columns > 0:
|
1220
|
+
top_columns = aggregated_df.groupby('col', observed=True)['value'].sum().nlargest(top_n_columns).index.tolist()
|
1221
|
+
# aggregated_df['col'] = aggregated_df['col'].apply(lambda x: x if x in top_columns else "<other>")
|
1222
|
+
# aggregated_df['col'] = pd.Categorical(aggregated_df['col'], categories=top_columns + ["<other>"], ordered=True)
|
1223
|
+
# aggregated_df['col'] = pd.Categorical(
|
1224
|
+
# aggregated_df['col'].map(lambda x: x if x in top_columns else "<other>"),
|
1225
|
+
# categories=top_columns + ["<other>"],
|
1226
|
+
# ordered=True
|
1227
|
+
# )
|
1228
|
+
aggregated_df['col'] = aggregated_df['col'].apply(lambda x: x if x in top_columns else "<other>")
|
1229
|
+
|
1230
|
+
|
1231
|
+
# columns = sorted(aggregated_df['col'].unique())
|
1232
|
+
columns = aggregated_df.groupby('col', observed=True)['value'].sum().sort_values(ascending=False).index.tolist()
|
1180
1233
|
column_colors = assign_column_colors(columns, color_palette, null_label)
|
1181
1234
|
|
1182
1235
|
fig = make_subplots(
|
pandas_plots/tbl.py
CHANGED
@@ -15,7 +15,7 @@ from plotly.subplots import make_subplots
|
|
15
15
|
from scipy import stats
|
16
16
|
import dataframe_image as dfi
|
17
17
|
|
18
|
-
from .hlp import wrap_text
|
18
|
+
from .hlp import wrap_text, to_series
|
19
19
|
|
20
20
|
import duckdb as ddb
|
21
21
|
|
@@ -696,7 +696,7 @@ def show_num_df(
|
|
696
696
|
|
697
697
|
|
698
698
|
|
699
|
-
def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
|
699
|
+
def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str=" ", precision: int=3):
|
700
700
|
"""
|
701
701
|
Print statistical summary for a pandas DataFrame or Series.
|
702
702
|
|
@@ -712,15 +712,44 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
|
|
712
712
|
df (Union[pd.DataFrame, pd.Series]): Input DataFrame or Series. Only numeric columns
|
713
713
|
in DataFrame are considered.
|
714
714
|
show (bool, optional): Whether to print the summary. Defaults to True.
|
715
|
-
name (str, optional): Prefix for the summary. Defaults to "
|
715
|
+
name (str, optional): Prefix for the summary. Defaults to " ".
|
716
|
+
precision (int, optional): Number of digits to round the results to. Defaults to 3.
|
716
717
|
"""
|
717
718
|
if df.empty:
|
718
719
|
return
|
719
720
|
|
720
721
|
# * drop NA to keep scipy sane
|
721
|
-
df = df.dropna().copy()
|
722
|
+
df = df.dropna().copy()
|
722
723
|
|
723
|
-
|
724
|
+
# display(df)
|
725
|
+
|
726
|
+
if len(df.columns) == 1:
|
727
|
+
df = df.to_series()
|
728
|
+
|
729
|
+
pd.api.types.is_numeric_dtype(df)
|
730
|
+
|
731
|
+
|
732
|
+
if not (
|
733
|
+
# * series must be numeric
|
734
|
+
(isinstance(df, pd.Series)
|
735
|
+
and pd.api.types.is_numeric_dtype(df)
|
736
|
+
)
|
737
|
+
or
|
738
|
+
# * df must have 2 columns str num
|
739
|
+
(len(df.columns) == 2
|
740
|
+
and (
|
741
|
+
(pd.api.types.is_object_dtype(df.iloc[:, 0]))
|
742
|
+
or (pd.api.types.is_bool_dtype(df.iloc[:, 0]))
|
743
|
+
)
|
744
|
+
and pd.api.types.is_numeric_dtype(df.iloc[:, 1])
|
745
|
+
)
|
746
|
+
):
|
747
|
+
print(f"❌ df must have 2 columns: [0] str or bool, [1] num, or be a series")
|
748
|
+
return
|
749
|
+
|
750
|
+
|
751
|
+
|
752
|
+
def print_summary_ser(ser: pd.Series, show: bool=True, name: str="", precision: int=3):
|
724
753
|
# Calculate IQR and pass `rng=(25, 75)` to get the interquartile range
|
725
754
|
iqr_value = stats.iqr(ser)
|
726
755
|
|
@@ -728,21 +757,21 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
|
|
728
757
|
# ser.dropna(inplace=True)
|
729
758
|
|
730
759
|
# Using the iqr function, we still calculate the bounds manually
|
731
|
-
q1 = stats.scoreatpercentile(ser, 25)
|
732
|
-
q3 = stats.scoreatpercentile(ser, 75)
|
760
|
+
q1 = round(stats.scoreatpercentile(ser, 25), precision)
|
761
|
+
q3 = round(stats.scoreatpercentile(ser, 75), precision)
|
733
762
|
|
734
763
|
# Calculate upper bound directly
|
735
|
-
min = round(ser.min(),
|
736
|
-
med = round(ser.median(),
|
737
|
-
upper = round(q3 + 1.5 * iqr_value,
|
738
|
-
lower = round(q1 - 1.5 * iqr_value,
|
739
|
-
mean = round(ser.mean(),
|
740
|
-
std = round(ser.std(),
|
741
|
-
cv = round(ser.std() / ser.mean(),
|
742
|
-
max = round(ser.max(),
|
743
|
-
sum = round(ser.sum(),
|
744
|
-
skew = round(stats.skew(ser.dropna().tolist()),
|
745
|
-
kurto = round(stats.kurtosis(ser.dropna().tolist()),
|
764
|
+
min = round(ser.min(), precision)
|
765
|
+
med = round(ser.median(), precision)
|
766
|
+
upper = round(q3 + 1.5 * iqr_value, precision)
|
767
|
+
lower = round(q1 - 1.5 * iqr_value, precision)
|
768
|
+
mean = round(ser.mean(), precision)
|
769
|
+
std = round(ser.std(), precision)
|
770
|
+
cv = round(ser.std() / ser.mean(), precision)
|
771
|
+
max = round(ser.max(), precision)
|
772
|
+
sum = round(ser.sum(), precision)
|
773
|
+
skew = round(stats.skew(ser.dropna().tolist()), precision)
|
774
|
+
kurto = round(stats.kurtosis(ser.dropna().tolist()), precision)
|
746
775
|
|
747
776
|
lower = min if lower < min else lower
|
748
777
|
upper = max if upper > max else upper
|
@@ -750,7 +779,7 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
|
|
750
779
|
# * extra care for scipy metrics, these are very vulnerable to nan
|
751
780
|
if show:
|
752
781
|
print(
|
753
|
-
f"""{name} min: {min:_} | lower: {lower:_} | q25: {q1:_} | median: {med:_} | mean: {mean:_} | q75: {q3:_} | upper: {upper:_} | max: {max:_} | std: {std:_} | cv: {cv:_} | sum: {sum:_} | skew: {skew} | kurto: {kurto}""")
|
782
|
+
f"""{name} -> min: {min:_} | lower: {lower:_} | q25: {q1:_} | median: {med:_} | mean: {mean:_} | q75: {q3:_} | upper: {upper:_} | max: {max:_} | std: {std:_} | cv: {cv:_} | sum: {sum:_} | skew: {skew} | kurto: {kurto}""")
|
754
783
|
|
755
784
|
summary = {
|
756
785
|
"min": min,
|
@@ -770,11 +799,22 @@ def print_summary(df: pd.DataFrame | pd.Series, show: bool = True, name: str="
|
|
770
799
|
return summary
|
771
800
|
|
772
801
|
if isinstance(df, pd.Series):
|
773
|
-
|
802
|
+
# * print series
|
803
|
+
name = df.name if df.name else "series"
|
804
|
+
print_summary_ser(ser=df, show=show, name=name, precision=precision)
|
805
|
+
return
|
774
806
|
|
775
807
|
if isinstance(df, pd.DataFrame):
|
776
|
-
# *
|
777
|
-
|
778
|
-
|
808
|
+
# * print for all values
|
809
|
+
print(f"🟧 all data")
|
810
|
+
name = df.columns[-1]
|
811
|
+
summary = print_summary_ser(ser=df.iloc[:,1], show=show, name=name, precision=precision)
|
812
|
+
|
813
|
+
print(f"🟧 boxes")
|
814
|
+
# * print for each value
|
815
|
+
for item in df.iloc[:,0].unique():
|
816
|
+
# display(df[df.iloc[:,0] == item])
|
817
|
+
print_summary_ser(ser=df[df.iloc[:,0] == item].iloc[:,1], show=show, name=item, precision=precision)
|
779
818
|
|
780
819
|
return summary
|
820
|
+
|
@@ -0,0 +1,11 @@
|
|
1
|
+
pandas_plots/hlp.py,sha256=N6NrbFagVMMX-ZnV0rIBEz82SeSoOkksfMcCap55W7E,16588
|
2
|
+
pandas_plots/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
|
3
|
+
pandas_plots/pls.py,sha256=WpQ8hPmp8MbHvgEvSejDYFXyY_hZabLY4OLW8S6u15g,44310
|
4
|
+
pandas_plots/tbl.py,sha256=tuTDRFaD4lKQ2fMeMCJwnJL65zXuUGVQ6uwQNVa0y6Q,31883
|
5
|
+
pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
|
6
|
+
pandas_plots-0.12.7.dist-info/LICENSE,sha256=6KQ5KVAAhRaB-JJKpX4cefKvRZRgI7GUPc92_2d31XY,1051
|
7
|
+
pandas_plots-0.12.7.dist-info/METADATA,sha256=9i_TsDQinaUPz9eqJO7a0L4JFZmmM3l_WzoPbmDPH0Y,7358
|
8
|
+
pandas_plots-0.12.7.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
9
|
+
pandas_plots-0.12.7.dist-info/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
|
10
|
+
pandas_plots-0.12.7.dist-info/top_level.txt,sha256=XnaNuIHBqMmCeh_U7nKOYTwFue_SIA0wxuDgdPmnnSk,13
|
11
|
+
pandas_plots-0.12.7.dist-info/RECORD,,
|
@@ -0,0 +1,76 @@
|
|
1
|
+
import pandas as pd
|
2
|
+
import re
|
3
|
+
|
4
|
+
|
5
|
+
def remove_pii(
    series: pd.Series,
    verbose: bool = True,
    logging: bool = False,
    custom_regex="",
) -> pd.Index:
    """
    Detect personally identifiable information (PII) in the given column.

    Despite its name, this function does not modify the input. It returns
    the index labels of all suspicious entries so the caller can drop them.

    Parameters:
    - series: A pandas Series representing a column in a DataFrame.
    - verbose: If True, print the number of hits and the matching values.
    - logging: If True, write the matching values into the file .pii.log
      (the file is overwritten on every call).
    - custom_regex: Optional extra regex; entries matching it are flagged too.

    Returns:
    - index object with the labels of all pii items (sorted union of the
      term, date, title and custom matches).

    Raises:
    - AssertionError: if the series is empty.

    Remarks:
    - df.drop(axis=0, index=result, inplace=True)
    """

    # * reject empty columns
    # raised explicitly (instead of `assert`) so the check survives `python -O`;
    # the exception type is unchanged for existing callers
    if len(series) == 0:
        raise AssertionError("series must not be empty")

    # * na must be dropped to ensure processing (works on a new object,
    # * the caller's series is left untouched)
    col = series.dropna()

    # * match against a string view so numeric or mixed-type columns do not
    # * break the .str accessor; `col` keeps the original values for output
    text = col.astype(str)

    def _hits(pattern: str, case: bool = True) -> pd.Index:
        """Return the index labels of all entries matching the regex."""
        return text[text.str.contains(pattern, case=case, regex=True)].index

    # * find terms (case-insensitive substrings)
    _terms = ("lösch", "herr", "frau", "strasse", "klinik")
    idx_terms = _hits("|".join(_terms), case=False)

    # * find dates in dd.mm.yyyy notation
    ptr_date = r"\d{2}\.\d{2}\.\d{4}"
    idx_date = _hits(ptr_date)

    # * titles: "Dr. " in any casing, " Fr. ", " Hr. ", " PD "
    # the blanks inside the alternation are literal and act as word separators;
    # the character classes must not contain "|" or a literal pipe would match
    ptr_dr = r"[Dd][Rr]\. | Fr\. | Hr\. | PD "
    idx_dr = _hits(ptr_dr)

    # * custom
    # the empty fallback is sliced from the real index so the union keeps
    # the original index dtype instead of degrading to object
    idx_custom = _hits(custom_regex) if custom_regex else text.index[:0]

    idx_all = idx_terms.union(idx_date).union(idx_dr).union(idx_custom)

    if verbose:
        print(f"found {len(idx_all):_} pii items:")
        print(col.loc[idx_all].tolist())

    if logging:
        data = col.loc[idx_all]
        # explicit utf-8: the data is expected to contain umlauts and the
        # platform default encoding may not be able to represent them
        with open(".pii.log", "w", encoding="utf-8") as f:
            # ! when using str(), it will give only a summary!
            f.write(data.to_string(index=True))

    return idx_all
|
@@ -1,10 +0,0 @@
|
|
1
|
-
pandas_plots/hlp.py,sha256=N6NrbFagVMMX-ZnV0rIBEz82SeSoOkksfMcCap55W7E,16588
|
2
|
-
pandas_plots/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
|
3
|
-
pandas_plots/pls.py,sha256=L10KeIvN1sNWtC6wK5IehinIQzcDVDrgx3DHxTy3cnU,42136
|
4
|
-
pandas_plots/tbl.py,sha256=4VvjLisPT1gSvgsLClcrhC7LIJ-_FPNla8nomGflGag,30509
|
5
|
-
pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
|
6
|
-
pandas_plots-0.12.5.dist-info/LICENSE,sha256=6KQ5KVAAhRaB-JJKpX4cefKvRZRgI7GUPc92_2d31XY,1051
|
7
|
-
pandas_plots-0.12.5.dist-info/METADATA,sha256=_rqHgKQ3vHsRFb57i0i5ZYG3SugslGUeVudP0pLtl2w,7358
|
8
|
-
pandas_plots-0.12.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
9
|
-
pandas_plots-0.12.5.dist-info/top_level.txt,sha256=XnaNuIHBqMmCeh_U7nKOYTwFue_SIA0wxuDgdPmnnSk,13
|
10
|
-
pandas_plots-0.12.5.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|