pandas-plots 0.12.29__tar.gz → 0.13.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pandas-plots
3
- Version: 0.12.29
3
+ Version: 0.13.0
4
4
  Summary: A collection of helper for table handling and visualization
5
5
  Home-page: https://github.com/smeisegeier/pandas-plots
6
6
  Author: smeisegeier
@@ -1,6 +1,6 @@
1
1
  [metadata]
2
2
  name = pandas-plots
3
- version = 0.12.29
3
+ version = 0.13.0
4
4
  author = smeisegeier
5
5
  author_email = dexterDSDo@googlemail.com
6
6
  description = A collection of helper for table handling and visualization
@@ -1,6 +1,5 @@
1
1
  from pathlib import Path
2
2
  import warnings
3
-
4
3
  warnings.filterwarnings("ignore")
5
4
 
6
5
  import os
@@ -13,6 +12,7 @@ from plotly import express as px
13
12
  import plotly.graph_objects as go
14
13
  from plotly.subplots import make_subplots
15
14
  import plotly # needed for return types
15
+ import re
16
16
 
17
17
  from .hlp import *
18
18
  from .tbl import print_summary
@@ -1437,6 +1437,229 @@ def plot_facet_stacked_bars(
1437
1437
 
1438
1438
  return fig
1439
1439
 
1440
+
1441
def _sankey_demo_frame():
    """Build the built-in demo dataset: 12 tumor ids with 1-5 dated treatments each."""
    rows = [
        # Tumor 1
        ('1', '2020-01-01', 'op'), ('1', '2021-02-01', 'syst'), ('1', '2022-03-01', 'op'),
        ('1', '2023-04-01', 'rad'), ('1', '2024-05-01', 'op'),
        # Tumor 2
        ('2', '2010-01-01', 'syst'), ('2', '2011-02-01', 'st'), ('2', '2012-03-01', 'op'),
        ('2', '2013-04-01', 'rad'),
        # Tumor 3
        ('3', '2015-01-01', 'op'), ('3', '2016-02-01', 'rad'), ('3', '2017-03-01', 'syst'),
        ('3', '2018-04-01', 'op'),
        # Tumor 4
        ('4', '2005-01-01', 'st'), ('4', '2006-02-01', 'syst'), ('4', '2007-03-01', 'op'),
        # Tumors 5-7 (two events each)
        ('5', '2019-01-01', 'op'), ('5', '2020-02-01', 'rad'),
        ('6', '2021-01-01', 'syst'), ('6', '2022-02-01', 'op'),
        ('7', '2014-01-01', 'st'), ('7', '2015-02-01', 'rad'),
        # Tumors 8-12 (single event each)
        ('8', '2025-01-01', 'op'),
        ('9', '2025-02-01', 'op'),
        ('10', '2025-03-01', 'syst'),
        ('11', '2025-04-01', 'rad'),
        ('12', '2025-05-01', 'op'),
    ]
    return pd.DataFrame(rows, columns=['tumor-id', 'diagnosis date', 'treatment'])


def _split_sankey_label(label):
    """Split a node label such as "[2] syst" into its parts: (2, "syst")."""
    order_text, _, event_name = label.partition('] ')
    return int(order_text[1:]), event_name


def plot_sankey(df=None, max_events_per_id=None, height=None, width=None, exclude_overlap_id=False, exclude_overlap_event=False, renderer=None, show_start_node=True):
    """
    Generates a Sankey diagram from a Pandas DataFrame, assuming the column order is:
    1. ID (string or integer)
    2. Date (date, datetime, or anything `pd.to_datetime` can parse)
    3. Event Name (converted to string for the node labels)

    Nodes represent the order of events (e.g., "[1] op", "[2] syst").
    A default demo is shown if no DataFrame is provided.
    Exact duplicate (ID, Date, Event) rows are collapsed before counting.

    Args:
        df (pd.DataFrame, optional): A Pandas DataFrame containing the event data.
            Expected column order: ID, Date, Event. Only the first three columns are used.
        max_events_per_id (int, optional): The maximum number of events to display for each ID.
            If None, all events for each ID will be used.
        height (int, optional): The height of the plot in pixels.
        width (int, optional): The width of the plot in pixels.
        exclude_overlap_id (bool): If True, excludes any IDs that have multiple events on the same date.
            This takes precedence over `exclude_overlap_event`.
        exclude_overlap_event (bool): If True, only excludes the specific events that fall on the same date,
            retaining other non-overlapping events for that ID.
        renderer (str, optional): The renderer to use for displaying the plot. Options include
            'browser', 'notebook', 'json', 'png', 'svg', 'jpeg', 'webp', or 'pdf'.
            If None, plotly's default renderer is used.
        show_start_node (bool): If True, adds a visual 'start' node and links all
            first events to it. This is useful for visualizing
            IDs with only one event.

    Returns:
        The plotly figure after it has been shown, or None (with a printed message)
        when the input has fewer than three columns, the date column cannot be
        converted, or no rows remain after filtering.
    """
    if df is None:
        df = _sankey_demo_frame()
        print("--- Using demo data (data_demo) ---")
        print(df.head().to_string())  # Print first 5 rows of the DataFrame prettily
        print("-----------------------------------")

    # Columns are recognized purely by position, so all three must be present.
    if df.shape[1] < 3:
        print("Error: Expected at least three columns in the order ID, Date, Event.")
        return None

    id_col_name = df.columns[0]
    date_col_name = df.columns[1]
    event_col_name = df.columns[2]

    # Work on a de-duplicated copy so the caller's frame is never modified.
    df_processed = df.drop_duplicates(subset=[id_col_name, date_col_name, event_col_name]).copy()

    try:
        df_processed[date_col_name] = pd.to_datetime(df_processed[date_col_name])
    except (ValueError, TypeError):
        print(f"Error: Could not convert column '{date_col_name}' to a valid date format.")
        return None

    # --- Handle overlap exclusion based on user selection ---
    overlap_title_part = ""
    if exclude_overlap_id or exclude_overlap_event:
        events_per_day = df_processed.groupby([id_col_name, date_col_name]).size()
        overlaps = events_per_day[events_per_day > 1].index
        if exclude_overlap_id:
            overlapping_ids = overlaps.get_level_values(id_col_name).unique()
            df_processed = df_processed[~df_processed[id_col_name].isin(overlapping_ids)].copy()
            overlap_title_part = ", overlap ids excluded"
        else:
            same_day_mask = df_processed.set_index([id_col_name, date_col_name]).index.isin(set(overlaps))
            df_processed = df_processed[~same_day_mask].copy()
            overlap_title_part = ", overlap events excluded"

    # Number each ID's events chronologically: 1, 2, 3, ...
    df_sorted = df_processed.sort_values(by=[id_col_name, date_col_name])
    df_sorted['event_order'] = df_sorted.groupby(id_col_name).cumcount() + 1

    if max_events_per_id is not None:
        df_sorted = df_sorted[df_sorted['event_order'] <= max_events_per_id].copy()

    # Percentages are relative to all IDs that survived the overlap filter.
    total_unique_ids = df_processed[id_col_name].nunique()

    # Also bail out when no non-null ID exists; the percentages would divide by zero.
    if df_sorted.empty or total_unique_ids == 0:
        print("No valid data to plot after filtering.")
        return None

    # Cast the event to str so non-string event columns can be concatenated;
    # a missing event therefore shows up under its string form instead of
    # silently breaking the chain of links.
    df_sorted['ordered_event_label'] = (
        '[' + df_sorted['event_order'].astype(str) + '] ' + df_sorted[event_col_name].astype(str)
    )

    # The previous event of the same ID is the link source; first events have none.
    link_cols = ['source_label', 'ordered_event_label']
    df_sorted['source_label'] = df_sorted.groupby(id_col_name)['ordered_event_label'].shift(1)
    df_with_links = df_sorted.dropna(subset=['source_label'])[link_cols]

    if show_start_node:
        # head(1) takes the actual first row of each ID; first() would instead
        # take the first non-null value of each column.
        first_events = df_sorted.groupby(id_col_name).head(1)[['ordered_event_label']].copy()
        first_events['source_label'] = "[0] start"
        df_with_links = pd.concat([first_events[link_cols], df_with_links], ignore_index=True)

    link_counts = df_with_links.groupby(link_cols).size().reset_index(name='value')

    # Nodes are ordered by event position first, then alphabetically by event name.
    all_labels = pd.concat([link_counts['source_label'], link_counts['ordered_event_label']]).unique()
    node_labels = sorted(all_labels, key=_split_sankey_label)
    label_to_index = {label: i for i, label in enumerate(node_labels)}

    node_counts = df_sorted['ordered_event_label'].value_counts()
    display_labels = []
    for label in node_labels:
        count = total_unique_ids if label == "[0] start" else node_counts.get(label, 0)
        percentage = (count / total_unique_ids) * 100
        formatted_count = f"{count:,}".replace(',', '_')
        formatted_percentage = f"({int(round(percentage, 0))}%)"
        display_labels.append(f"{label} {formatted_count} {formatted_percentage}")

    # Map sources and targets to node indices
    sources = link_counts['source_label'].map(label_to_index).tolist()
    targets = link_counts['ordered_event_label'].map(label_to_index).tolist()
    values = link_counts['value'].tolist()

    # Links share a color per (source event, target event) pair regardless of
    # position; the palette is reused cyclically once exhausted.
    color_palette = [
        "rgba(255, 99, 71, 0.6)", "rgba(60, 179, 113, 0.6)", "rgba(65, 105, 225, 0.6)",
        "rgba(255, 215, 0, 0.6)", "rgba(147, 112, 219, 0.6)", "rgba(0, 206, 209, 0.6)",
        "rgba(255, 160, 122, 0.6)", "rgba(124, 252, 0, 0.6)", "rgba(30, 144, 255, 0.6)",
        "rgba(218, 165, 32, 0.6)",
    ]
    start_link_color = "rgba(128, 128, 128, 0.6)"

    link_colors = []
    link_type_to_color = {}
    for source_l, target_l in zip(link_counts['source_label'], link_counts['ordered_event_label']):
        if source_l == "[0] start":
            link_colors.append(start_link_color)
            continue
        link_type = (_split_sankey_label(source_l)[1], _split_sankey_label(target_l)[1])
        if link_type not in link_type_to_color:
            link_type_to_color[link_type] = color_palette[len(link_type_to_color) % len(color_palette)]
        link_colors.append(link_type_to_color[link_type])

    formatted_total_ids = f"{total_unique_ids:,}".replace(',', '_')
    total_rows = len(df_processed)
    formatted_total_rows = f"{total_rows:,}".replace(',', '_')

    chart_title = f"[{id_col_name}] over [{event_col_name}]"
    if max_events_per_id is not None:
        chart_title += f", top {max_events_per_id} events"
    chart_title += overlap_title_part
    chart_title += f", n = {formatted_total_ids} ({formatted_total_rows})"

    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=display_labels,
            color="blue",
            align="left",
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values,
            color=link_colors,
        ),
    )])

    fig.update_layout(title_text=chart_title, font_size=10, height=height, width=width)
    fig.show(renderer=renderer)

    # Hand the figure back so callers can save or customize it further.
    return fig
1661
+
1662
+
1440
1663
  # * extend objects to enable chaining
1441
1664
  pd.DataFrame.plot_bars = plot_bars
1442
1665
  pd.DataFrame.plot_stacked_bars = plot_stacked_bars
@@ -1445,4 +1668,5 @@ pd.DataFrame.plot_stacked_box = plot_box
1445
1668
  pd.DataFrame.plot_stacked_boxes = plot_boxes
1446
1669
  pd.DataFrame.plot_quadrants = plot_quadrants
1447
1670
  pd.DataFrame.plot_histogram = plot_histogram
1448
- pd.DataFrame.plot_joint = plot_joint
1671
+ pd.DataFrame.plot_joint = plot_joint
1672
+ pd.DataFrame.plot_sankey = plot_sankey
@@ -172,6 +172,9 @@ def describe_df(
172
172
  # * fix bug(?) in plotly/choreographer - datetime columns are not plotted, set these to str
173
173
  datetime_cols = df.select_dtypes(include=['datetime64']).columns
174
174
  df[datetime_cols] = df[datetime_cols].astype(str)
175
+
176
+ # ! Drop completely empty columns (Series)
177
+ df = df.dropna(axis=1, how='all')
175
178
 
176
179
  # * reduce column names len if selected
177
180
  if top_n_chars_in_columns > 0:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pandas-plots
3
- Version: 0.12.29
3
+ Version: 0.13.0
4
4
  Summary: A collection of helper for table handling and visualization
5
5
  Home-page: https://github.com/smeisegeier/pandas-plots
6
6
  Author: smeisegeier
File without changes
File without changes