pandas-plots 0.12.30__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
pandas_plots/pls.py CHANGED
@@ -1,6 +1,5 @@
1
1
  from pathlib import Path
2
2
  import warnings
3
-
4
3
  warnings.filterwarnings("ignore")
5
4
 
6
5
  import os
@@ -13,6 +12,7 @@ from plotly import express as px
13
12
  import plotly.graph_objects as go
14
13
  from plotly.subplots import make_subplots
15
14
  import plotly # needed for return types
15
+ import re
16
16
 
17
17
  from .hlp import *
18
18
  from .tbl import print_summary
@@ -1437,6 +1437,229 @@ def plot_facet_stacked_bars(
1437
1437
 
1438
1438
  return fig
1439
1439
 
1440
+
1441
def _sankey_demo_frame():
    # Demo dataset used when plot_sankey() is called without a DataFrame.
    data_demo = {
        'tumor-id': [
            '1', '1', '1', '1', '1',
            '2', '2', '2', '2',
            '3', '3', '3', '3',
            '4', '4', '4',
            '5', '5',
            '6', '6',
            '7', '7',
            '8',
            '9',
            '10',
            '11',
            '12'
        ],
        'diagnosis date': [
            '2020-01-01', '2021-02-01', '2022-03-01', '2023-04-01', '2024-05-01',  # Tumor 1
            '2010-01-01', '2011-02-01', '2012-03-01', '2013-04-01',  # Tumor 2
            '2015-01-01', '2016-02-01', '2017-03-01', '2018-04-01',  # Tumor 3
            '2005-01-01', '2006-02-01', '2007-03-01',  # Tumor 4
            '2019-01-01', '2020-02-01',  # Tumor 5
            '2021-01-01', '2022-02-01',  # Tumor 6
            '2014-01-01', '2015-02-01',  # Tumor 7
            '2025-01-01',  # Tumor 8 (single event)
            '2025-02-01',  # Tumor 9 (single event)
            '2025-03-01',  # Tumor 10 (single event)
            '2025-04-01',  # Tumor 11 (single event)
            '2025-05-01'   # Tumor 12 (single event)
        ],
        'treatment': [
            'op', 'syst', 'op', 'rad', 'op',  # Tumor 1
            'syst', 'st', 'op', 'rad',  # Tumor 2
            'op', 'rad', 'syst', 'op',  # Tumor 3
            'st', 'syst', 'op',  # Tumor 4
            'op', 'rad',  # Tumor 5
            'syst', 'op',  # Tumor 6
            'st', 'rad',  # Tumor 7
            'op',  # Tumor 8
            'op',  # Tumor 9
            'syst',  # Tumor 10
            'rad',  # Tumor 11
            'op'  # Tumor 12
        ]
    }
    return pd.DataFrame(data_demo)


def _sankey_link_colors(link_counts):
    # Assign one stable color per (source event, target event) pair; grey for start links.
    color_palette = [
        "rgba(255, 99, 71, 0.6)", "rgba(60, 179, 113, 0.6)", "rgba(65, 105, 225, 0.6)",
        "rgba(255, 215, 0, 0.6)", "rgba(147, 112, 219, 0.6)", "rgba(0, 206, 209, 0.6)",
        "rgba(255, 160, 122, 0.6)", "rgba(124, 252, 0, 0.6)", "rgba(30, 144, 255, 0.6)",
        "rgba(218, 165, 32, 0.6)"
    ]
    start_link_color = "rgba(128, 128, 128, 0.6)"

    link_colors = []
    link_type_to_color = {}
    for _, row in link_counts.iterrows():
        source_l = row['source_label']
        target_l = row['ordered_event_label']
        if source_l == "[0] start":
            link_colors.append(start_link_color)
            continue
        # Labels have the form "[n] event"; strip the "[n] " prefix to get the event name.
        link_type = (
            re.search(r'\] (.*)', source_l).group(1),
            re.search(r'\] (.*)', target_l).group(1),
        )
        if link_type not in link_type_to_color:
            link_type_to_color[link_type] = color_palette[len(link_type_to_color) % len(color_palette)]
        link_colors.append(link_type_to_color[link_type])
    return link_colors


def plot_sankey(df=None, max_events_per_id=None, height=None, width=None, exclude_overlap_id=False, exclude_overlap_event=False, renderer=None, show_start_node=True):
    """
    Generates a Sankey diagram from a Pandas DataFrame, assuming the column order is:
    1. ID (string or integer)
    2. Date (date, datetime, or string convertible to numeric)
    3. Event Name (string)

    Nodes represent the order of events (e.g., "[1] op", "[2] syst").
    A default demo is shown if no DataFrame is provided.

    Args:
        df (pd.DataFrame, optional): A Pandas DataFrame containing the event data.
            Expected column order: ID, Date, Event.
        max_events_per_id (int, optional): The maximum number of events to display for each ID.
            If None, all events for each ID will be used.
        height (int, optional): The height of the plot in pixels.
        width (int, optional): The width of the plot in pixels.
        exclude_overlap_id (bool): If True, excludes any IDs that have multiple events on the same date.
            This takes precedence over `exclude_overlap_event`.
        exclude_overlap_event (bool): If True, only excludes the specific events that fall on the same date,
            retaining other non-overlapping events for that ID.
        renderer (str, optional): The renderer to use for displaying the plot. Options include
            'browser', 'notebook', 'json', 'png', 'svg', 'jpeg', 'webp', or 'pdf'.
            If None, plotly's default renderer is used.
        show_start_node (bool): If True, adds a visual 'start' node and links all
            first events to it. This is useful for visualizing
            IDs with only one event.

    Returns:
        plotly.graph_objects.Figure | None: the rendered figure (consistent with the
        other plot_* functions in this module), or None when the date column cannot
        be parsed or no data remains after filtering.
    """
    if df is None:
        df = _sankey_demo_frame()
        print("--- Using demo data (data_demo) ---")
        print(df.head().to_string())  # Print first 5 rows of the DataFrame prettily
        print("-----------------------------------")

    # Columns are recognized purely by position: ID, date, event.
    id_col_name = df.columns[0]
    date_col_name = df.columns[1]
    event_col_name = df.columns[2]

    # Aggregate the data to remove duplicate rows before processing.
    df_processed = df.copy().drop_duplicates(subset=[id_col_name, date_col_name, event_col_name])

    try:
        df_processed[date_col_name] = pd.to_datetime(df_processed[date_col_name])
    except (ValueError, TypeError):
        print(f"Error: Could not convert column '{date_col_name}' to a valid date format.")
        return None

    # --- Handle overlap exclusion based on user selection ---
    overlap_title_part = ""
    if exclude_overlap_id:
        # Drop every ID that has two or more events on the same date.
        dup_sizes = df_processed.groupby([id_col_name, date_col_name]).size()
        overlapping_ids = dup_sizes.loc[dup_sizes > 1].index.get_level_values(id_col_name).unique()
        df_processed = df_processed[~df_processed[id_col_name].isin(overlapping_ids)].copy()
        overlap_title_part = ", overlap ids excluded"
    elif exclude_overlap_event:
        # Drop only the (id, date) rows that collide; keep the ID's other events.
        dup_sizes = df_processed.groupby([id_col_name, date_col_name]).size()
        overlapping_event_set = set(dup_sizes.loc[dup_sizes > 1].index)
        df_processed = df_processed[~df_processed.set_index([id_col_name, date_col_name]).index.isin(overlapping_event_set)].copy()
        overlap_title_part = ", overlap events excluded"

    df_sorted = df_processed.sort_values(by=[id_col_name, date_col_name])

    # Vectorized per-ID event numbering instead of per-row loops.
    df_sorted['event_order'] = df_sorted.groupby(id_col_name).cumcount() + 1

    if max_events_per_id is not None:
        df_sorted = df_sorted[df_sorted['event_order'] <= max_events_per_id]

    df_sorted['ordered_event_label'] = '[' + df_sorted['event_order'].astype(str) + '] ' + df_sorted[event_col_name]

    if df_sorted.empty:
        print("No valid data to plot after filtering.")
        return None

    # A vectorized shift yields the predecessor event (link source) per ID.
    df_sorted['source_label'] = df_sorted.groupby(id_col_name)['ordered_event_label'].shift(1)
    df_with_links = df_sorted.dropna(subset=['source_label']).copy()

    # Optionally link every ID's first event to a synthetic start node.
    if show_start_node:
        first_events = df_sorted.groupby(id_col_name).first().reset_index()
        first_events['source_label'] = "[0] start"
        df_with_links = pd.concat(
            [first_events[['source_label', 'ordered_event_label']],
             df_with_links[['source_label', 'ordered_event_label']]],
            ignore_index=True,
        )

    link_counts = df_with_links.groupby(['source_label', 'ordered_event_label']).size().reset_index(name='value')

    # Sort node labels by event order, then event name ("[0] start" sorts first).
    all_labels = pd.concat([link_counts['source_label'], link_counts['ordered_event_label']]).unique()
    unique_labels_df = pd.DataFrame(all_labels, columns=['label'])
    # expand=False makes str.extract return a Series, avoiding DataFrame-to-column assignment.
    unique_labels_df['event_order_num'] = unique_labels_df['label'].str.extract(r'\[(\d+)\]', expand=False).astype(float).fillna(0)
    unique_labels_df['event_name'] = unique_labels_df['label'].str.extract(r'\] (.*)', expand=False).fillna('start')
    unique_unformatted_labels_sorted = unique_labels_df.sort_values(by=['event_order_num', 'event_name'])['label'].tolist()

    label_to_index = {label: i for i, label in enumerate(unique_unformatted_labels_sorted)}

    # Percentages are relative to the number of distinct IDs after preprocessing.
    total_unique_ids = df_processed[id_col_name].nunique()

    display_labels = []
    node_counts = df_sorted['ordered_event_label'].value_counts()
    for label in unique_unformatted_labels_sorted:
        count = total_unique_ids if label == "[0] start" else node_counts.get(label, 0)
        percentage = (count / total_unique_ids) * 100
        formatted_count = f"{count:,}".replace(',', '_')
        formatted_percentage = f"({int(round(percentage, 0))}%)"
        display_labels.append(f"{label} {formatted_count} {formatted_percentage}")

    # Map sources and targets to node indices.
    sources = link_counts['source_label'].map(label_to_index).tolist()
    targets = link_counts['ordered_event_label'].map(label_to_index).tolist()
    values = link_counts['value'].tolist()
    link_colors = _sankey_link_colors(link_counts)

    formatted_total_ids = f"{total_unique_ids:,}".replace(',', '_')
    formatted_total_rows = f"{len(df_processed):,}".replace(',', '_')

    chart_title = f"[{id_col_name}] over [{event_col_name}]"
    if max_events_per_id is not None:
        chart_title += f", top {max_events_per_id} events"
    chart_title += overlap_title_part
    chart_title += f", n = {formatted_total_ids} ({formatted_total_rows})"

    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=display_labels,
            color="blue",
            align="left"
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values,
            color=link_colors
        )
    )])

    fig.update_layout(title_text=chart_title, font_size=10, height=height, width=width)
    fig.show(renderer=renderer)
    # Return the figure for chaining / further customization, like the other plot_* functions.
    return fig
1662
+
1440
1663
  # * extend objects to enable chaining
1441
1664
  pd.DataFrame.plot_bars = plot_bars
1442
1665
  pd.DataFrame.plot_stacked_bars = plot_stacked_bars
@@ -1445,4 +1668,5 @@ pd.DataFrame.plot_stacked_box = plot_box
1445
1668
  pd.DataFrame.plot_stacked_boxes = plot_boxes
1446
1669
  pd.DataFrame.plot_quadrants = plot_quadrants
1447
1670
  pd.DataFrame.plot_histogram = plot_histogram
1448
- pd.DataFrame.plot_joint = plot_joint
1671
+ pd.DataFrame.plot_joint = plot_joint
1672
+ pd.DataFrame.plot_sankey = plot_sankey
@@ -1,38 +1,34 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pandas-plots
3
- Version: 0.12.30
3
+ Version: 0.14.0
4
4
  Summary: A collection of helper for table handling and visualization
5
- Home-page: https://github.com/smeisegeier/pandas-plots
6
- Author: smeisegeier
7
- Author-email: dexterDSDo@googlemail.com
8
- License: MIT
9
- Project-URL: Documentation, https://github.com/smeisegeier/pandas-plots
10
- Project-URL: Source Code, https://github.com/smeisegeier/pandas-plots
5
+ Project-URL: Homepage, https://github.com/smeisegeier/pandas-plots
6
+ Project-URL: Repository, https://github.com/smeisegeier/pandas-plots
11
7
  Project-URL: Bug Tracker, https://github.com/smeisegeier/pandas-plots/issues
12
- Classifier: License :: OSI Approved :: MIT License
13
- Classifier: Programming Language :: Python :: 3
14
- Classifier: Programming Language :: Python :: 3.10
8
+ Author-email: smeisegeier <meisegeiers@rki.de>
9
+ License-File: LICENSE
10
+ Keywords: pivot,plot,plotly,tables,venn,vizualization
15
11
  Classifier: Development Status :: 4 - Beta
16
12
  Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
17
14
  Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
18
17
  Classifier: Topic :: Scientific/Engineering
19
18
  Requires-Python: >=3.10
20
- Description-Content-Type: text/markdown
21
- License-File: LICENSE
22
- Requires-Dist: pandas>=2.0.0
23
- Requires-Dist: plotly<6
24
- Requires-Dist: matplotlib>=3.8.2
19
+ Requires-Dist: dataframe-image>=0.2.6
20
+ Requires-Dist: duckdb>=1.3.0
21
+ Requires-Dist: jinja2>=3.1.4
25
22
  Requires-Dist: matplotlib-venn==0.11.10
26
- Requires-Dist: seaborn>=0.13.2
27
- Requires-Dist: Jinja2>=3.1.4
28
- Requires-Dist: requests>=2.32.0
29
- Requires-Dist: numpy<2.0.0
23
+ Requires-Dist: matplotlib>=3.8.2
30
24
  Requires-Dist: missingno>=0.5.2
31
- Requires-Dist: duckdb>=1.0.0
32
- Requires-Dist: kaleido>=0.2.0
33
25
  Requires-Dist: nbformat>=4.2.0
34
- Requires-Dist: dataframe_image>=0.2.6
35
- Dynamic: license-file
26
+ Requires-Dist: numpy<2.0.0
27
+ Requires-Dist: pandas>=2.0.0
28
+ Requires-Dist: plotly>=6.2
29
+ Requires-Dist: requests>=2.32.0
30
+ Requires-Dist: seaborn>=0.13.2
31
+ Description-Content-Type: text/markdown
36
32
 
37
33
  # pandas-plots
38
34
 
@@ -98,6 +94,7 @@ tbl.show_num_df(
98
94
  - `plot_joints()` a joint plot for **exactly two numerical** columns
99
95
  - `plot_quadrants()` quickly shows a 2x2 heatmap
100
96
  - `plot_facet_stacked_bars()` shows stacked bars for a facet value as subplots
97
+ - `plot_sankey()` generates a Sankey diagram
101
98
  <br>
102
99
 
103
100
  - `ven` offers functions for _venn diagrams_
@@ -175,4 +172,4 @@ _df, _details = ven.show_venn3(
175
172
 
176
173
  ## tags
177
174
 
178
- #pandas, #plotly, #visualizations, #statistics
175
+ #pandas, #plotly, #visualizations, #statistics
@@ -0,0 +1,9 @@
1
+ pandas_plots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ pandas_plots/hlp.py,sha256=z8rrVNbH9qMohdXPT-FksP-VkTOjI0bGFj47Sw5p3aY,21141
3
+ pandas_plots/pls.py,sha256=80uXr3bT66LGjDcuT4a0ewCBwATcOUZ3QQ228Hn9glY,60052
4
+ pandas_plots/tbl.py,sha256=R2E6FLhxNpUtS88Zf88Eh9i8dSKgmJtmFimFvOt0foQ,32780
5
+ pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
6
+ pandas_plots-0.14.0.dist-info/METADATA,sha256=tw4QxZ9io1c9MgSESxsrGHdKXqoTr9-xNfOpV5hxfUo,7394
7
+ pandas_plots-0.14.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
8
+ pandas_plots-0.14.0.dist-info/licenses/LICENSE,sha256=ltLbQWUCs-GBQlTPXbt5nHNBE9U5LzjjoS1Y8hHETM4,1051
9
+ pandas_plots-0.14.0.dist-info/RECORD,,
@@ -1,5 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: hatchling 1.27.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
-
@@ -1,10 +0,0 @@
1
- pandas_plots/hlp.py,sha256=z8rrVNbH9qMohdXPT-FksP-VkTOjI0bGFj47Sw5p3aY,21141
2
- pandas_plots/pls.py,sha256=M-UrYcgQHWUeiuqjrq2TNJHca5cHfvS5pEp66Qu2Nrs,48886
3
- pandas_plots/tbl.py,sha256=R2E6FLhxNpUtS88Zf88Eh9i8dSKgmJtmFimFvOt0foQ,32780
4
- pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
5
- pandas_plots-0.12.30.dist-info/licenses/LICENSE,sha256=ltLbQWUCs-GBQlTPXbt5nHNBE9U5LzjjoS1Y8hHETM4,1051
6
- pandas_plots-0.12.30.dist-info/METADATA,sha256=vI58okMF7hvV_OuE7dRDQxKyP_W7umWL6_PntoMK2L0,7431
7
- pandas_plots-0.12.30.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
- pandas_plots-0.12.30.dist-info/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
9
- pandas_plots-0.12.30.dist-info/top_level.txt,sha256=XnaNuIHBqMmCeh_U7nKOYTwFue_SIA0wxuDgdPmnnSk,13
10
- pandas_plots-0.12.30.dist-info/RECORD,,
@@ -1,76 +0,0 @@
1
- import pandas as pd
2
- import re
3
-
4
-
5
def remove_pii(
    series: pd.Series,
    verbose: bool = True,
    logging: bool = False,
    custom_regex: str = "",
) -> pd.Index:
    """
    Remove personally identifiable information (PII) from the given column.

    Parameters:
    - series: A pandas Series representing a column in a DataFrame.
    - verbose: If True, print pii items
    - logging: If True, write pii items into the file .pii.log
    - custom_regex: Regex that is injected into detection

    Returns:
    - index object with indexes of all pii items

    Remarks:
    - df.drop(axis=0, index=result, inplace=True)
    """

    # * reject empty columns (explicit raise survives `python -O`, unlike assert)
    if series.empty:
        raise AssertionError("series must not be empty")

    # * na must be dropped to ensure processing; dropna returns a copy
    col = series.dropna()

    # * find salutation / address / clinic terms (German), case-insensitive
    _terms = frozenset(["lösch", "herr", "frau", "strasse", "klinik"])
    idx_terms = col[
        col.str.contains(
            "|".join(_terms),
            case=False,
            regex=True,
        )
    ].index

    # # * optional: search for terms in whole df
    # df.apply(lambda row: row.astype(str).str.contains('test', case=False, regex=True).any(), axis=1)

    # * find dates in dd.mm.yyyy format
    ptr_date = r"\d{2}\.\d{2}\.\d{4}"
    idx_date = col[col.str.contains(ptr_date, regex=True)].index

    # * doctor / salutation abbreviations
    # NOTE: previous pattern r"[D|d][R|r]\. ..." treated '|' as a literal inside
    # the character class, so it also matched strings like "|r. " — fixed.
    ptr_dr = r"[Dd][Rr]\. | Fr\. | Hr\. | PD "
    idx_dr = col[col.str.contains(ptr_dr, regex=True)].index

    # * custom caller-supplied pattern (skipped when empty)
    idx_custom = (
        col[col.str.contains(custom_regex, regex=True)].index
        if custom_regex
        else pd.Index([])
    )

    idx_all = idx_terms.union(idx_date).union(idx_dr).union(idx_custom)

    if verbose:
        print(f"found {len(idx_all):_} pii items:")
        print(col.loc[idx_all].tolist())

    if logging:
        data = col.loc[idx_all]
        with open(".pii.log", "w") as f:
            # ! when using str(), it will give only a summary!
            f.write(data.to_string(index=True))

    return idx_all
@@ -1 +0,0 @@
1
- pandas_plots