PyPI - pandas-plots - Versions diffs - 0.12.24__tar.gz → 0.12.26__tar.gz - Mend

pandas-plots 0.12.24.tar.gz → 0.12.26.tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Files changed (16) hide show

{pandas_plots-0.12.24/src/pandas_plots.egg-info → pandas_plots-0.12.26}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pandas-plots
-Version: 0.12.24
+Version: 0.12.26
 Summary: A collection of helper for table handling and visualization
 Home-page: https://github.com/smeisegeier/pandas-plots
 Author: smeisegeier
@@ -49,7 +49,7 @@ pip install pandas-plots -U
 include in python
 ```python
-from pandas_plots import tbl, pls, ven, hlp, pii
+from pandas_plots import tbl, pls, ven, hlp
 ```
 ## example
@@ -119,9 +119,6 @@ tbl.show_num_df(
   - `add_measures_to_pyg_config()` adds measures to a pygwalker config file to avoid frequent manual update
 <br>
-- `pii` has routines for handling of personally identifiable information
-  - `remove_pii()` logs and deletes pii from a series
 > note: theme setting can be controlled through all functions by setting the environment variable `THEME` to either light or dark
 ## more examples

{pandas_plots-0.12.24 → pandas_plots-0.12.26}/README.md RENAMED Viewed

@@ -13,7 +13,7 @@ pip install pandas-plots -U
 include in python
 ```python
-from pandas_plots import tbl, pls, ven, hlp, pii
+from pandas_plots import tbl, pls, ven, hlp
 ```
 ## example
@@ -83,9 +83,6 @@ tbl.show_num_df(
   - `add_measures_to_pyg_config()` adds measures to a pygwalker config file to avoid frequent manual update
 <br>
-- `pii` has routines for handling of personally identifiable information
-  - `remove_pii()` logs and deletes pii from a series
 > note: theme setting can be controlled through all functions by setting the environment variable `THEME` to either light or dark
 ## more examples

{pandas_plots-0.12.24 → pandas_plots-0.12.26}/setup.cfg RENAMED Viewed

@@ -1,6 +1,6 @@
 [metadata]
 name = pandas-plots
-version = 0.12.24
+version = 0.12.26
 author = smeisegeier
 author_email = dexterDSDo@googlemail.com
 description = A collection of helper for table handling and visualization

{pandas_plots-0.12.24 → pandas_plots-0.12.26}/src/pandas_plots/hlp.py RENAMED Viewed

@@ -21,44 +21,49 @@ from PIL import Image
 URL_REGEX = r"^(?:http|ftp)s?://"  # https://stackoverflow.com/a/1617386
-def mean_confidence_interval(df, confidence=0.95, use_median=False):
+def mean_confidence_interval(data, confidence=0.95, use_median=False, n_bootstraps=1000):
     """
-    Calculate the mean or median and confidence interval of the input dataframe.
-    Source: https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data
+    Calculate the mean or median and confidence interval.
+    For median, uses bootstrapping for a more robust confidence interval.
     Parameters:
-    df (array-like): The input dataframe.
+    data (array-like): The input data.
     confidence (float, optional): The confidence level for the interval. Defaults to 0.95.
-    use_median (bool, optional): If True, calculates median and confidence interval instead of mean. Defaults to False.
+    use_median (bool, optional): If True, calculates median and its confidence interval. Defaults to False.
+    n_bootstraps (int, optional): Number of bootstrap samples for median CI. Only used if use_median is True.
     Returns:
-    tuple: A tuple containing the central value (mean or median), interval, lower bound, and upper bound.
+    tuple: A tuple containing the central value (mean or median), margin of error, lower bound, and upper bound.
     """
-    df = to_series(df)
-    if df is None:
-        return None
-    a = 1.0 * np.array(df)
+    data = to_series(data)
+    if data is None or len(data) == 0:
+        return np.nan, np.nan, np.nan, np.nan
+    a = 1.0 * np.array(data)
     n = len(a)
     if use_median:
+        if n < 2: # Cannot bootstrap with n < 2
+            return np.median(a), np.nan, np.nan, np.nan
+        bootstrapped_medians = []
+        for _ in range(n_bootstraps):
+            sample = np.random.choice(a, size=n, replace=True)
+            bootstrapped_medians.append(np.median(sample))
         median = np.median(a)
-        se = 1.253 * scipy.stats.sem(a)  # Approximate standard error for median
-        margin = se * scipy.stats.t.ppf((1 + confidence) / 2.0, n - 1)
-        return median, margin, median - margin, median + margin
+        alpha = (1 - confidence) / 2
+        lower_bound = np.percentile(bootstrapped_medians, alpha * 100)
+        upper_bound = np.percentile(bootstrapped_medians, (1 - alpha) * 100)
+        margin = (upper_bound - lower_bound) / 2 # Simple approximation for margin based on interval width
+        return median, margin, lower_bound, upper_bound
     else:
-        mean, se = np.mean(a), scipy.stats.sem(a)
+        mean = np.mean(a)
+        if n <= 1:
+            return mean, np.nan, np.nan, np.nan
+        se = scipy.stats.sem(a)
         margin = se * scipy.stats.t.ppf((1 + confidence) / 2.0, n - 1)
         return mean, margin, mean - margin, mean + margin
-    # # * Alternative
-    # # from statistics import NormalDist
-    # def confidence_interval(data, confidence=0.95):
-    #     dist = NormalDist.from_samples(data)
-    #     z = NormalDist().inv_cdf((1 + confidence) / 2.)
-    #     h = dist.stdev * z / ((len(data) - 1) ** .5)
-    #     return dist.mean - h, dist.mean + h
 def to_series(df) -> pd.Series | None:
     """

{pandas_plots-0.12.24 → pandas_plots-0.12.26}/src/pandas_plots/pls.py RENAMED Viewed

@@ -563,9 +563,10 @@ def plot_bars(
     # * ensure df is grouped to prevent false aggregations, reset index to return df
     if use_ci:
-        # * grouping is smoother on df than on series
-        df = (
-            df_in.groupby(
+# * grouping is smoother on df than on series
+        df = (df_in
+            # ? dont dropna() here, this biases the input data
+            .groupby(
                 col_index,
                 dropna=False,
             )
@@ -576,10 +577,11 @@ def plot_bars(
             )
             .reset_index()
         )
-        # * enforce vertical bars when using ci
+        # * enforce vertical bars **when using ci**, normalize=False, dropna=True, set empty margin to 0 to avoid dropping the bar
         orientation = "v"
         normalize = False
         dropna = True
+        df.margin.fillna(0, inplace=True)
     else:
         df = df_in.groupby(col_index, dropna=dropna)[col_name].sum().reset_index()
@@ -591,6 +593,7 @@ def plot_bars(
     else:
         df = df.fillna("<NA>")
     # * get n, col1 now is always numeric
     n = df[df.columns[1]].sum()
     n_len = len(df_in)
@@ -1116,6 +1119,8 @@ def plot_box(
     return fig
 def plot_boxes(
     df: pd.DataFrame,
     caption: str = None,
@@ -1164,14 +1169,9 @@ def plot_boxes(
     xlvl2 = 0
     xlvl3 = 50
-    # * not working
-    # yspan_seg = (df.iloc[:, 1].max() - df.iloc[:, 1].max()) * .05
-    # ylvl1 = -yspan_seg
-    # ylvl2 = 0
-    # ylvl3 = yspan_seg
     # * unique items
-    items = df.iloc[:, 0].unique()
+    # Sort the unique items alphabetically
+    items = sorted(df.iloc[:, 0].unique())
     caption = _set_caption(caption)
     log_str = " (log-scale)" if use_log else ""
@@ -1196,6 +1196,9 @@ def plot_boxes(
         ),
     )
+    # * Set the order of the x-axis categories
+    fig.update_xaxes(categoryorder="array", categoryarray=items)
     # * yshift is trivial
     YS = 0
@@ -1273,6 +1276,7 @@ def plot_boxes(
     return fig
 def plot_facet_stacked_bars(
     df: pd.DataFrame,
     subplots_per_row: int = 4,

{pandas_plots-0.12.24 → pandas_plots-0.12.26/src/pandas_plots.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pandas-plots
-Version: 0.12.24
+Version: 0.12.26
 Summary: A collection of helper for table handling and visualization
 Home-page: https://github.com/smeisegeier/pandas-plots
 Author: smeisegeier
@@ -49,7 +49,7 @@ pip install pandas-plots -U
 include in python
 ```python
-from pandas_plots import tbl, pls, ven, hlp, pii
+from pandas_plots import tbl, pls, ven, hlp
 ```
 ## example
@@ -119,9 +119,6 @@ tbl.show_num_df(
   - `add_measures_to_pyg_config()` adds measures to a pygwalker config file to avoid frequent manual update
 <br>
-- `pii` has routines for handling of personally identifiable information
-  - `remove_pii()` logs and deletes pii from a series
 > note: theme setting can be controlled through all functions by setting the environment variable `THEME` to either light or dark
 ## more examples

{pandas_plots-0.12.24 → pandas_plots-0.12.26}/src/pandas_plots.egg-info/SOURCES.txt RENAMED Viewed

@@ -3,7 +3,6 @@ README.md
 pyproject.toml
 setup.cfg
 src/pandas_plots/hlp.py
-src/pandas_plots/pii.py
 src/pandas_plots/pls.py
 src/pandas_plots/tbl.py
 src/pandas_plots/ven.py

pandas_plots-0.12.24/src/pandas_plots/pii.py DELETED Viewed

@@ -1,76 +0,0 @@
-import pandas as pd
-import re
-def remove_pii(
-    series: pd.Series,
-    verbose: bool = True,
-    logging: bool = False,
-    custom_regex="",
-) -> pd.Index:
-    """
-    Remove personally identifiable information (PII) from the given column.
-    Parameters:
-    - series: A pandas Series representing a column in a DataFrame.
-    - verbose: If True, print pii items
-    - logging: If True, write pii items into the file .pii.log
-    - custom_regex: Regex that is injected into detection
-    Returns:
-    - index object with indexes of all pii items
-    Remarks:
-    - df.drop(axis=0, index=result, inplace=True)
-    """
-    # * reject empty columns
-    assert len(series) > 0
-    col = series.copy()
-    # * na must be dropped to ensure processsing
-    col.dropna(inplace=True)
-    # * find terms
-    _terms = frozenset(["lösch", "herr", "frau", "strasse", "klinik"])
-    idx_terms = col[
-        col.str.contains(
-            "|".join(_terms),
-            case=False,
-            regex=True,
-        )
-    ].index
-    # # * optional: search for terms in whole df
-    # df.apply(lambda row: row.astype(str).str.contains('test', case=False, regex=True).any(), axis=1)
-    # # * find dates
-    ptr_date = r"\d{2}\.\d{2}\.\d{4}"
-    idx_date = col[col.str.contains(ptr_date, regex=True)].index
-    # * dr
-    ptr_dr = r"[D|d][R|r]\. | Fr\. | Hr\. | PD "
-    idx_dr = col[col.str.contains(ptr_dr, regex=True)].index
-    # * custom
-    idx_custom = (
-        col[col.str.contains(custom_regex, regex=True)].index
-        if custom_regex
-        else pd.Index([])
-    )
-    idx_all = idx_terms.union(idx_date).union(idx_dr).union(idx_custom)
-    if verbose:
-        # print(f"found: {idx_dr.__len__()} dr | {idx_date.__len__()} date | {idx_terms.__len__()} terms")
-        print(f"found {idx_all.__len__():_} pii items:")
-        print(col.loc[idx_all].tolist())
-    if logging:  # Assuming logging is defined and has the correct value
-        data = col.loc[idx_all]  # Assuming col and idx_all are defined
-        with open(".pii.log", "w") as f:
-            # ! when using str(), it will give only a summary!
-            f.write(data.to_string(index=True))
-    return idx_all