pandas-plots 0.12.24__py3-none-any.whl → 0.12.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pandas_plots/hlp.py +28 -23
- pandas_plots/pls.py +15 -11
- {pandas_plots-0.12.24.dist-info → pandas_plots-0.12.26.dist-info}/METADATA +2 -5
- pandas_plots-0.12.26.dist-info/RECORD +10 -0
- {pandas_plots-0.12.24.dist-info → pandas_plots-0.12.26.dist-info}/WHEEL +1 -1
- pandas_plots/pii.py +0 -76
- pandas_plots-0.12.24.dist-info/RECORD +0 -11
- {pandas_plots-0.12.24.dist-info → pandas_plots-0.12.26.dist-info}/licenses/LICENSE +0 -0
- {pandas_plots-0.12.24.dist-info → pandas_plots-0.12.26.dist-info}/pii.py +0 -0
- {pandas_plots-0.12.24.dist-info → pandas_plots-0.12.26.dist-info}/top_level.txt +0 -0
pandas_plots/hlp.py
CHANGED
@@ -21,44 +21,49 @@ from PIL import Image
|
|
21
21
|
|
22
22
|
URL_REGEX = r"^(?:http|ftp)s?://" # https://stackoverflow.com/a/1617386
|
23
23
|
|
24
|
-
|
25
|
-
def mean_confidence_interval(df, confidence=0.95, use_median=False):
|
24
|
+
def mean_confidence_interval(data, confidence=0.95, use_median=False, n_bootstraps=1000):
|
26
25
|
"""
|
27
|
-
Calculate the mean or median and confidence interval
|
28
|
-
|
26
|
+
Calculate the mean or median and confidence interval.
|
27
|
+
For median, uses bootstrapping for a more robust confidence interval.
|
29
28
|
|
30
29
|
Parameters:
|
31
|
-
|
30
|
+
data (array-like): The input data.
|
32
31
|
confidence (float, optional): The confidence level for the interval. Defaults to 0.95.
|
33
|
-
use_median (bool, optional): If True, calculates median and confidence interval
|
32
|
+
use_median (bool, optional): If True, calculates median and its confidence interval. Defaults to False.
|
33
|
+
n_bootstraps (int, optional): Number of bootstrap samples for median CI. Only used if use_median is True.
|
34
34
|
|
35
35
|
Returns:
|
36
|
-
tuple: A tuple containing the central value (mean or median),
|
36
|
+
tuple: A tuple containing the central value (mean or median), margin of error, lower bound, and upper bound.
|
37
37
|
"""
|
38
|
-
|
39
|
-
if
|
40
|
-
return
|
41
|
-
a = 1.0 * np.array(
|
38
|
+
data = to_series(data)
|
39
|
+
if data is None or len(data) == 0:
|
40
|
+
return np.nan, np.nan, np.nan, np.nan
|
41
|
+
a = 1.0 * np.array(data)
|
42
42
|
n = len(a)
|
43
43
|
|
44
44
|
if use_median:
|
45
|
+
if n < 2: # Cannot bootstrap with n < 2
|
46
|
+
return np.median(a), np.nan, np.nan, np.nan
|
47
|
+
|
48
|
+
bootstrapped_medians = []
|
49
|
+
for _ in range(n_bootstraps):
|
50
|
+
sample = np.random.choice(a, size=n, replace=True)
|
51
|
+
bootstrapped_medians.append(np.median(sample))
|
52
|
+
|
45
53
|
median = np.median(a)
|
46
|
-
|
47
|
-
|
48
|
-
|
54
|
+
alpha = (1 - confidence) / 2
|
55
|
+
lower_bound = np.percentile(bootstrapped_medians, alpha * 100)
|
56
|
+
upper_bound = np.percentile(bootstrapped_medians, (1 - alpha) * 100)
|
57
|
+
margin = (upper_bound - lower_bound) / 2 # Simple approximation for margin based on interval width
|
58
|
+
return median, margin, lower_bound, upper_bound
|
49
59
|
else:
|
50
|
-
mean
|
60
|
+
mean = np.mean(a)
|
61
|
+
if n <= 1:
|
62
|
+
return mean, np.nan, np.nan, np.nan
|
63
|
+
se = scipy.stats.sem(a)
|
51
64
|
margin = se * scipy.stats.t.ppf((1 + confidence) / 2.0, n - 1)
|
52
65
|
return mean, margin, mean - margin, mean + margin
|
53
66
|
|
54
|
-
# # * Alternative
|
55
|
-
# # from statistics import NormalDist
|
56
|
-
# def confidence_interval(data, confidence=0.95):
|
57
|
-
# dist = NormalDist.from_samples(data)
|
58
|
-
# z = NormalDist().inv_cdf((1 + confidence) / 2.)
|
59
|
-
# h = dist.stdev * z / ((len(data) - 1) ** .5)
|
60
|
-
# return dist.mean - h, dist.mean + h
|
61
|
-
|
62
67
|
|
63
68
|
def to_series(df) -> pd.Series | None:
|
64
69
|
"""
|
pandas_plots/pls.py
CHANGED
@@ -563,9 +563,10 @@ def plot_bars(
|
|
563
563
|
|
564
564
|
# * ensure df is grouped to prevent false aggregations, reset index to return df
|
565
565
|
if use_ci:
|
566
|
-
|
567
|
-
df = (
|
568
|
-
|
566
|
+
# * grouping is smoother on df than on series
|
567
|
+
df = (df_in
|
568
|
+
# ? don't dropna() here, this biases the input data
|
569
|
+
.groupby(
|
569
570
|
col_index,
|
570
571
|
dropna=False,
|
571
572
|
)
|
@@ -576,10 +577,11 @@ def plot_bars(
|
|
576
577
|
)
|
577
578
|
.reset_index()
|
578
579
|
)
|
579
|
-
# * enforce vertical bars when using ci
|
580
|
+
# * enforce vertical bars **when using ci**, normalize=False, dropna=True, set empty margin to 0 to avoid dropping the bar
|
580
581
|
orientation = "v"
|
581
582
|
normalize = False
|
582
583
|
dropna = True
|
584
|
+
df.margin.fillna(0, inplace=True)
|
583
585
|
else:
|
584
586
|
df = df_in.groupby(col_index, dropna=dropna)[col_name].sum().reset_index()
|
585
587
|
|
@@ -591,6 +593,7 @@ def plot_bars(
|
|
591
593
|
else:
|
592
594
|
df = df.fillna("<NA>")
|
593
595
|
|
596
|
+
|
594
597
|
# * get n, col1 now is always numeric
|
595
598
|
n = df[df.columns[1]].sum()
|
596
599
|
n_len = len(df_in)
|
@@ -1116,6 +1119,8 @@ def plot_box(
|
|
1116
1119
|
return fig
|
1117
1120
|
|
1118
1121
|
|
1122
|
+
|
1123
|
+
|
1119
1124
|
def plot_boxes(
|
1120
1125
|
df: pd.DataFrame,
|
1121
1126
|
caption: str = None,
|
@@ -1164,14 +1169,9 @@ def plot_boxes(
|
|
1164
1169
|
xlvl2 = 0
|
1165
1170
|
xlvl3 = 50
|
1166
1171
|
|
1167
|
-
# * not working
|
1168
|
-
# yspan_seg = (df.iloc[:, 1].max() - df.iloc[:, 1].max()) * .05
|
1169
|
-
# ylvl1 = -yspan_seg
|
1170
|
-
# ylvl2 = 0
|
1171
|
-
# ylvl3 = yspan_seg
|
1172
|
-
|
1173
1172
|
# * unique items
|
1174
|
-
|
1173
|
+
# Sort the unique items alphabetically
|
1174
|
+
items = sorted(df.iloc[:, 0].unique())
|
1175
1175
|
|
1176
1176
|
caption = _set_caption(caption)
|
1177
1177
|
log_str = " (log-scale)" if use_log else ""
|
@@ -1196,6 +1196,9 @@ def plot_boxes(
|
|
1196
1196
|
),
|
1197
1197
|
)
|
1198
1198
|
|
1199
|
+
# * Set the order of the x-axis categories
|
1200
|
+
fig.update_xaxes(categoryorder="array", categoryarray=items)
|
1201
|
+
|
1199
1202
|
# * yshift is trivial
|
1200
1203
|
YS = 0
|
1201
1204
|
|
@@ -1273,6 +1276,7 @@ def plot_boxes(
|
|
1273
1276
|
|
1274
1277
|
return fig
|
1275
1278
|
|
1279
|
+
|
1276
1280
|
def plot_facet_stacked_bars(
|
1277
1281
|
df: pd.DataFrame,
|
1278
1282
|
subplots_per_row: int = 4,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: pandas-plots
|
3
|
-
Version: 0.12.
|
3
|
+
Version: 0.12.26
|
4
4
|
Summary: A collection of helper for table handling and visualization
|
5
5
|
Home-page: https://github.com/smeisegeier/pandas-plots
|
6
6
|
Author: smeisegeier
|
@@ -49,7 +49,7 @@ pip install pandas-plots -U
|
|
49
49
|
include in python
|
50
50
|
|
51
51
|
```python
|
52
|
-
from pandas_plots import tbl, pls, ven, hlp
|
52
|
+
from pandas_plots import tbl, pls, ven, hlp
|
53
53
|
```
|
54
54
|
|
55
55
|
## example
|
@@ -119,9 +119,6 @@ tbl.show_num_df(
|
|
119
119
|
- `add_measures_to_pyg_config()` adds measures to a pygwalker config file to avoid frequent manual update
|
120
120
|
<br>
|
121
121
|
|
122
|
-
- `pii` has routines for handling of personally identifiable information
|
123
|
-
- `remove_pii()` logs and deletes pii from a series
|
124
|
-
|
125
122
|
> note: theme setting can be controlled through all functions by setting the environment variable `THEME` to either light or dark
|
126
123
|
|
127
124
|
## more examples
|
@@ -0,0 +1,10 @@
|
|
1
|
+
pandas_plots/hlp.py,sha256=z8rrVNbH9qMohdXPT-FksP-VkTOjI0bGFj47Sw5p3aY,21141
|
2
|
+
pandas_plots/pls.py,sha256=wFRQurKtG1GrZK8Z7eHfi7OwrbfM9Xitk_7PolWc_Dk,48823
|
3
|
+
pandas_plots/tbl.py,sha256=RJWBHeKGTAhGpVCY57TsS_dYR-FpInP-TOsKW_tU4V4,32556
|
4
|
+
pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
|
5
|
+
pandas_plots-0.12.26.dist-info/licenses/LICENSE,sha256=ltLbQWUCs-GBQlTPXbt5nHNBE9U5LzjjoS1Y8hHETM4,1051
|
6
|
+
pandas_plots-0.12.26.dist-info/METADATA,sha256=GD2hSPNemqOdnXr96lGvAr3LZQCpm0pjKgv78yd3IKk,7431
|
7
|
+
pandas_plots-0.12.26.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
8
|
+
pandas_plots-0.12.26.dist-info/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
|
9
|
+
pandas_plots-0.12.26.dist-info/top_level.txt,sha256=XnaNuIHBqMmCeh_U7nKOYTwFue_SIA0wxuDgdPmnnSk,13
|
10
|
+
pandas_plots-0.12.26.dist-info/RECORD,,
|
pandas_plots/pii.py
DELETED
@@ -1,76 +0,0 @@
|
|
1
|
-
import pandas as pd
|
2
|
-
import re
|
3
|
-
|
4
|
-
|
5
|
-
def remove_pii(
|
6
|
-
series: pd.Series,
|
7
|
-
verbose: bool = True,
|
8
|
-
logging: bool = False,
|
9
|
-
custom_regex="",
|
10
|
-
) -> pd.Index:
|
11
|
-
"""
|
12
|
-
Remove personally identifiable information (PII) from the given column.
|
13
|
-
|
14
|
-
Parameters:
|
15
|
-
- series: A pandas Series representing a column in a DataFrame.
|
16
|
-
- verbose: If True, print pii items
|
17
|
-
- logging: If True, write pii items into the file .pii.log
|
18
|
-
- custom_regex: Regex that is injected into detection
|
19
|
-
|
20
|
-
Returns:
|
21
|
-
- index object with indexes of all pii items
|
22
|
-
|
23
|
-
Remarks:
|
24
|
-
- df.drop(axis=0, index=result, inplace=True)
|
25
|
-
"""
|
26
|
-
|
27
|
-
# * reject empty columns
|
28
|
-
assert len(series) > 0
|
29
|
-
|
30
|
-
col = series.copy()
|
31
|
-
|
32
|
-
# * na must be dropped to ensure processing
|
33
|
-
col.dropna(inplace=True)
|
34
|
-
|
35
|
-
# * find terms
|
36
|
-
_terms = frozenset(["lösch", "herr", "frau", "strasse", "klinik"])
|
37
|
-
idx_terms = col[
|
38
|
-
col.str.contains(
|
39
|
-
"|".join(_terms),
|
40
|
-
case=False,
|
41
|
-
regex=True,
|
42
|
-
)
|
43
|
-
].index
|
44
|
-
|
45
|
-
# # * optional: search for terms in whole df
|
46
|
-
# df.apply(lambda row: row.astype(str).str.contains('test', case=False, regex=True).any(), axis=1)
|
47
|
-
|
48
|
-
# # * find dates
|
49
|
-
ptr_date = r"\d{2}\.\d{2}\.\d{4}"
|
50
|
-
idx_date = col[col.str.contains(ptr_date, regex=True)].index
|
51
|
-
|
52
|
-
# * dr
|
53
|
-
ptr_dr = r"[D|d][R|r]\. | Fr\. | Hr\. | PD "
|
54
|
-
idx_dr = col[col.str.contains(ptr_dr, regex=True)].index
|
55
|
-
|
56
|
-
# * custom
|
57
|
-
idx_custom = (
|
58
|
-
col[col.str.contains(custom_regex, regex=True)].index
|
59
|
-
if custom_regex
|
60
|
-
else pd.Index([])
|
61
|
-
)
|
62
|
-
|
63
|
-
idx_all = idx_terms.union(idx_date).union(idx_dr).union(idx_custom)
|
64
|
-
|
65
|
-
if verbose:
|
66
|
-
# print(f"found: {idx_dr.__len__()} dr | {idx_date.__len__()} date | {idx_terms.__len__()} terms")
|
67
|
-
print(f"found {idx_all.__len__():_} pii items:")
|
68
|
-
print(col.loc[idx_all].tolist())
|
69
|
-
|
70
|
-
if logging: # Assuming logging is defined and has the correct value
|
71
|
-
data = col.loc[idx_all] # Assuming col and idx_all are defined
|
72
|
-
with open(".pii.log", "w") as f:
|
73
|
-
# ! when using str(), it will give only a summary!
|
74
|
-
f.write(data.to_string(index=True))
|
75
|
-
|
76
|
-
return idx_all
|
@@ -1,11 +0,0 @@
|
|
1
|
-
pandas_plots/hlp.py,sha256=kSqoGMEaOtC94wtTS7CMFXMgptv-2tSOMf5Zm7euhpI,20838
|
2
|
-
pandas_plots/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
|
3
|
-
pandas_plots/pls.py,sha256=jFsHvjG8fvLBdHpaYOX_5TgpDrcA5bMWjAUtXb6bVXo,48629
|
4
|
-
pandas_plots/tbl.py,sha256=RJWBHeKGTAhGpVCY57TsS_dYR-FpInP-TOsKW_tU4V4,32556
|
5
|
-
pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
|
6
|
-
pandas_plots-0.12.24.dist-info/licenses/LICENSE,sha256=ltLbQWUCs-GBQlTPXbt5nHNBE9U5LzjjoS1Y8hHETM4,1051
|
7
|
-
pandas_plots-0.12.24.dist-info/METADATA,sha256=5519ufLPkBZEaylDrN6lC-D5Rtc7xr4tGQVNDtW_5Ms,7564
|
8
|
-
pandas_plots-0.12.24.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
|
9
|
-
pandas_plots-0.12.24.dist-info/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
|
10
|
-
pandas_plots-0.12.24.dist-info/top_level.txt,sha256=XnaNuIHBqMmCeh_U7nKOYTwFue_SIA0wxuDgdPmnnSk,13
|
11
|
-
pandas_plots-0.12.24.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|