pandas-plots 0.12.24__tar.gz → 0.12.26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pandas-plots
3
- Version: 0.12.24
3
+ Version: 0.12.26
4
4
  Summary: A collection of helper for table handling and visualization
5
5
  Home-page: https://github.com/smeisegeier/pandas-plots
6
6
  Author: smeisegeier
@@ -49,7 +49,7 @@ pip install pandas-plots -U
49
49
  include in python
50
50
 
51
51
  ```python
52
- from pandas_plots import tbl, pls, ven, hlp, pii
52
+ from pandas_plots import tbl, pls, ven, hlp
53
53
  ```
54
54
 
55
55
  ## example
@@ -119,9 +119,6 @@ tbl.show_num_df(
119
119
  - `add_measures_to_pyg_config()` adds measures to a pygwalker config file to avoid frequent manual updates
120
120
  <br>
121
121
 
122
- - `pii` has routines for handling of personally identifiable information
123
- - `remove_pii()` logs and deletes pii from a series
124
-
125
122
  > note: theme setting can be controlled through all functions by setting the environment variable `THEME` to either light or dark
126
123
 
127
124
  ## more examples
@@ -13,7 +13,7 @@ pip install pandas-plots -U
13
13
  include in python
14
14
 
15
15
  ```python
16
- from pandas_plots import tbl, pls, ven, hlp, pii
16
+ from pandas_plots import tbl, pls, ven, hlp
17
17
  ```
18
18
 
19
19
  ## example
@@ -83,9 +83,6 @@ tbl.show_num_df(
83
83
  - `add_measures_to_pyg_config()` adds measures to a pygwalker config file to avoid frequent manual updates
84
84
  <br>
85
85
 
86
- - `pii` has routines for handling of personally identifiable information
87
- - `remove_pii()` logs and deletes pii from a series
88
-
89
86
  > note: theme setting can be controlled through all functions by setting the environment variable `THEME` to either light or dark
90
87
 
91
88
  ## more examples
@@ -1,6 +1,6 @@
1
1
  [metadata]
2
2
  name = pandas-plots
3
- version = 0.12.24
3
+ version = 0.12.26
4
4
  author = smeisegeier
5
5
  author_email = dexterDSDo@googlemail.com
6
6
  description = A collection of helper for table handling and visualization
@@ -21,44 +21,49 @@ from PIL import Image
21
21
 
22
22
  URL_REGEX = r"^(?:http|ftp)s?://" # https://stackoverflow.com/a/1617386
23
23
 
24
-
25
- def mean_confidence_interval(df, confidence=0.95, use_median=False):
24
+ def mean_confidence_interval(data, confidence=0.95, use_median=False, n_bootstraps=1000):
26
25
  """
27
- Calculate the mean or median and confidence interval of the input dataframe.
28
- Source: https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data
26
+ Calculate the mean or median and confidence interval.
27
+ For median, uses bootstrapping for a more robust confidence interval.
29
28
 
30
29
  Parameters:
31
- df (array-like): The input dataframe.
30
+ data (array-like): The input data.
32
31
  confidence (float, optional): The confidence level for the interval. Defaults to 0.95.
33
- use_median (bool, optional): If True, calculates median and confidence interval instead of mean. Defaults to False.
32
+ use_median (bool, optional): If True, calculates median and its confidence interval. Defaults to False.
33
+ n_bootstraps (int, optional): Number of bootstrap samples for median CI. Only used if use_median is True.
34
34
 
35
35
  Returns:
36
- tuple: A tuple containing the central value (mean or median), interval, lower bound, and upper bound.
36
+ tuple: A tuple containing the central value (mean or median), margin of error, lower bound, and upper bound.
37
37
  """
38
- df = to_series(df)
39
- if df is None:
40
- return None
41
- a = 1.0 * np.array(df)
38
+ data = to_series(data)
39
+ if data is None or len(data) == 0:
40
+ return np.nan, np.nan, np.nan, np.nan
41
+ a = 1.0 * np.array(data)
42
42
  n = len(a)
43
43
 
44
44
  if use_median:
45
+ if n < 2: # Cannot bootstrap with n < 2
46
+ return np.median(a), np.nan, np.nan, np.nan
47
+
48
+ bootstrapped_medians = []
49
+ for _ in range(n_bootstraps):
50
+ sample = np.random.choice(a, size=n, replace=True)
51
+ bootstrapped_medians.append(np.median(sample))
52
+
45
53
  median = np.median(a)
46
- se = 1.253 * scipy.stats.sem(a) # Approximate standard error for median
47
- margin = se * scipy.stats.t.ppf((1 + confidence) / 2.0, n - 1)
48
- return median, margin, median - margin, median + margin
54
+ alpha = (1 - confidence) / 2
55
+ lower_bound = np.percentile(bootstrapped_medians, alpha * 100)
56
+ upper_bound = np.percentile(bootstrapped_medians, (1 - alpha) * 100)
57
+ margin = (upper_bound - lower_bound) / 2 # Simple approximation for margin based on interval width
58
+ return median, margin, lower_bound, upper_bound
49
59
  else:
50
- mean, se = np.mean(a), scipy.stats.sem(a)
60
+ mean = np.mean(a)
61
+ if n <= 1:
62
+ return mean, np.nan, np.nan, np.nan
63
+ se = scipy.stats.sem(a)
51
64
  margin = se * scipy.stats.t.ppf((1 + confidence) / 2.0, n - 1)
52
65
  return mean, margin, mean - margin, mean + margin
53
66
 
54
- # # * Alternative
55
- # # from statistics import NormalDist
56
- # def confidence_interval(data, confidence=0.95):
57
- # dist = NormalDist.from_samples(data)
58
- # z = NormalDist().inv_cdf((1 + confidence) / 2.)
59
- # h = dist.stdev * z / ((len(data) - 1) ** .5)
60
- # return dist.mean - h, dist.mean + h
61
-
62
67
 
63
68
  def to_series(df) -> pd.Series | None:
64
69
  """
@@ -563,9 +563,10 @@ def plot_bars(
563
563
 
564
564
  # * ensure df is grouped to prevent false aggregations, reset index to return df
565
565
  if use_ci:
566
- # * grouping is smoother on df than on series
567
- df = (
568
- df_in.groupby(
566
+ # * grouping is smoother on df than on series
567
+ df = (df_in
568
+ # ? don't dropna() here, this biases the input data
569
+ .groupby(
569
570
  col_index,
570
571
  dropna=False,
571
572
  )
@@ -576,10 +577,11 @@ def plot_bars(
576
577
  )
577
578
  .reset_index()
578
579
  )
579
- # * enforce vertical bars when using ci
580
+ # * enforce vertical bars **when using ci**, normalize=False, dropna=True, set empty margin to 0 to avoid dropping the bar
580
581
  orientation = "v"
581
582
  normalize = False
582
583
  dropna = True
584
+ df.margin.fillna(0, inplace=True)
583
585
  else:
584
586
  df = df_in.groupby(col_index, dropna=dropna)[col_name].sum().reset_index()
585
587
 
@@ -591,6 +593,7 @@ def plot_bars(
591
593
  else:
592
594
  df = df.fillna("<NA>")
593
595
 
596
+
594
597
  # * get n, col1 now is always numeric
595
598
  n = df[df.columns[1]].sum()
596
599
  n_len = len(df_in)
@@ -1116,6 +1119,8 @@ def plot_box(
1116
1119
  return fig
1117
1120
 
1118
1121
 
1122
+
1123
+
1119
1124
  def plot_boxes(
1120
1125
  df: pd.DataFrame,
1121
1126
  caption: str = None,
@@ -1164,14 +1169,9 @@ def plot_boxes(
1164
1169
  xlvl2 = 0
1165
1170
  xlvl3 = 50
1166
1171
 
1167
- # * not working
1168
- # yspan_seg = (df.iloc[:, 1].max() - df.iloc[:, 1].max()) * .05
1169
- # ylvl1 = -yspan_seg
1170
- # ylvl2 = 0
1171
- # ylvl3 = yspan_seg
1172
-
1173
1172
  # * unique items
1174
- items = df.iloc[:, 0].unique()
1173
+ # Sort the unique items alphabetically
1174
+ items = sorted(df.iloc[:, 0].unique())
1175
1175
 
1176
1176
  caption = _set_caption(caption)
1177
1177
  log_str = " (log-scale)" if use_log else ""
@@ -1196,6 +1196,9 @@ def plot_boxes(
1196
1196
  ),
1197
1197
  )
1198
1198
 
1199
+ # * Set the order of the x-axis categories
1200
+ fig.update_xaxes(categoryorder="array", categoryarray=items)
1201
+
1199
1202
  # * yshift is trivial
1200
1203
  YS = 0
1201
1204
 
@@ -1273,6 +1276,7 @@ def plot_boxes(
1273
1276
 
1274
1277
  return fig
1275
1278
 
1279
+
1276
1280
  def plot_facet_stacked_bars(
1277
1281
  df: pd.DataFrame,
1278
1282
  subplots_per_row: int = 4,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pandas-plots
3
- Version: 0.12.24
3
+ Version: 0.12.26
4
4
  Summary: A collection of helper for table handling and visualization
5
5
  Home-page: https://github.com/smeisegeier/pandas-plots
6
6
  Author: smeisegeier
@@ -49,7 +49,7 @@ pip install pandas-plots -U
49
49
  include in python
50
50
 
51
51
  ```python
52
- from pandas_plots import tbl, pls, ven, hlp, pii
52
+ from pandas_plots import tbl, pls, ven, hlp
53
53
  ```
54
54
 
55
55
  ## example
@@ -119,9 +119,6 @@ tbl.show_num_df(
119
119
  - `add_measures_to_pyg_config()` adds measures to a pygwalker config file to avoid frequent manual updates
120
120
  <br>
121
121
 
122
- - `pii` has routines for handling of personally identifiable information
123
- - `remove_pii()` logs and deletes pii from a series
124
-
125
122
  > note: theme setting can be controlled through all functions by setting the environment variable `THEME` to either light or dark
126
123
 
127
124
  ## more examples
@@ -3,7 +3,6 @@ README.md
3
3
  pyproject.toml
4
4
  setup.cfg
5
5
  src/pandas_plots/hlp.py
6
- src/pandas_plots/pii.py
7
6
  src/pandas_plots/pls.py
8
7
  src/pandas_plots/tbl.py
9
8
  src/pandas_plots/ven.py
@@ -1,76 +0,0 @@
1
- import pandas as pd
2
- import re
3
-
4
-
5
- def remove_pii(
6
- series: pd.Series,
7
- verbose: bool = True,
8
- logging: bool = False,
9
- custom_regex="",
10
- ) -> pd.Index:
11
- """
12
- Remove personally identifiable information (PII) from the given column.
13
-
14
- Parameters:
15
- - series: A pandas Series representing a column in a DataFrame.
16
- - verbose: If True, print pii items
17
- - logging: If True, write pii items into the file .pii.log
18
- - custom_regex: Regex that is injected into detection
19
-
20
- Returns:
21
- - index object with indexes of all pii items
22
-
23
- Remarks:
24
- - df.drop(axis=0, index=result, inplace=True)
25
- """
26
-
27
- # * reject empty columns
28
- assert len(series) > 0
29
-
30
- col = series.copy()
31
-
32
- # * na must be dropped to ensure processing
33
- col.dropna(inplace=True)
34
-
35
- # * find terms
36
- _terms = frozenset(["lösch", "herr", "frau", "strasse", "klinik"])
37
- idx_terms = col[
38
- col.str.contains(
39
- "|".join(_terms),
40
- case=False,
41
- regex=True,
42
- )
43
- ].index
44
-
45
- # # * optional: search for terms in whole df
46
- # df.apply(lambda row: row.astype(str).str.contains('test', case=False, regex=True).any(), axis=1)
47
-
48
- # # * find dates
49
- ptr_date = r"\d{2}\.\d{2}\.\d{4}"
50
- idx_date = col[col.str.contains(ptr_date, regex=True)].index
51
-
52
- # * dr
53
- ptr_dr = r"[D|d][R|r]\. | Fr\. | Hr\. | PD "
54
- idx_dr = col[col.str.contains(ptr_dr, regex=True)].index
55
-
56
- # * custom
57
- idx_custom = (
58
- col[col.str.contains(custom_regex, regex=True)].index
59
- if custom_regex
60
- else pd.Index([])
61
- )
62
-
63
- idx_all = idx_terms.union(idx_date).union(idx_dr).union(idx_custom)
64
-
65
- if verbose:
66
- # print(f"found: {idx_dr.__len__()} dr | {idx_date.__len__()} date | {idx_terms.__len__()} terms")
67
- print(f"found {idx_all.__len__():_} pii items:")
68
- print(col.loc[idx_all].tolist())
69
-
70
- if logging: # Assuming logging is defined and has the correct value
71
- data = col.loc[idx_all] # Assuming col and idx_all are defined
72
- with open(".pii.log", "w") as f:
73
- # ! when using str(), it will give only a summary!
74
- f.write(data.to_string(index=True))
75
-
76
- return idx_all
File without changes