pandas-plots 0.12.29__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pandas_plots/pls.py +226 -2
- pandas_plots/tbl.py +3 -0
- {pandas_plots-0.12.29.dist-info → pandas_plots-0.13.0.dist-info}/METADATA +1 -1
- pandas_plots-0.13.0.dist-info/RECORD +10 -0
- pandas_plots-0.12.29.dist-info/RECORD +0 -10
- {pandas_plots-0.12.29.dist-info → pandas_plots-0.13.0.dist-info}/WHEEL +0 -0
- {pandas_plots-0.12.29.dist-info → pandas_plots-0.13.0.dist-info}/licenses/LICENSE +0 -0
- {pandas_plots-0.12.29.dist-info → pandas_plots-0.13.0.dist-info}/pii.py +0 -0
- {pandas_plots-0.12.29.dist-info → pandas_plots-0.13.0.dist-info}/top_level.txt +0 -0
pandas_plots/pls.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
from pathlib import Path
|
2
2
|
import warnings
|
3
|
-
|
4
3
|
warnings.filterwarnings("ignore")
|
5
4
|
|
6
5
|
import os
|
@@ -13,6 +12,7 @@ from plotly import express as px
|
|
13
12
|
import plotly.graph_objects as go
|
14
13
|
from plotly.subplots import make_subplots
|
15
14
|
import plotly # needed for return types
|
15
|
+
import re
|
16
16
|
|
17
17
|
from .hlp import *
|
18
18
|
from .tbl import print_summary
|
@@ -1437,6 +1437,229 @@ def plot_facet_stacked_bars(
|
|
1437
1437
|
|
1438
1438
|
return fig
|
1439
1439
|
|
1440
|
+
|
1441
|
+
# Label of the synthetic node placed in front of every ID's first event.
_SANKEY_START_LABEL = "[0] start"

# Links leaving the start node are drawn in a neutral grey.
_SANKEY_START_LINK_COLOR = "rgba(128, 128, 128, 0.6)"

# Colours handed out, in order of first appearance, to each distinct
# (source event, target event) transition; reused cyclically when exhausted.
_SANKEY_LINK_PALETTE = [
    "rgba(255, 99, 71, 0.6)", "rgba(60, 179, 113, 0.6)", "rgba(65, 105, 225, 0.6)",
    "rgba(255, 215, 0, 0.6)", "rgba(147, 112, 219, 0.6)", "rgba(0, 206, 209, 0.6)",
    "rgba(255, 160, 122, 0.6)", "rgba(124, 252, 0, 0.6)", "rgba(30, 144, 255, 0.6)",
    "rgba(218, 165, 32, 0.6)",
]

# Captures the event name that follows the "[n] " prefix of a node label.
_SANKEY_EVENT_NAME_RE = re.compile(r'\] (.*)')


def _sankey_demo_frame():
    """Build the demo DataFrame (ID, date, event) used when no data is passed in."""
    rows = [
        # Tumor 1
        ('1', '2020-01-01', 'op'), ('1', '2021-02-01', 'syst'), ('1', '2022-03-01', 'op'),
        ('1', '2023-04-01', 'rad'), ('1', '2024-05-01', 'op'),
        # Tumor 2
        ('2', '2010-01-01', 'syst'), ('2', '2011-02-01', 'st'),
        ('2', '2012-03-01', 'op'), ('2', '2013-04-01', 'rad'),
        # Tumor 3
        ('3', '2015-01-01', 'op'), ('3', '2016-02-01', 'rad'),
        ('3', '2017-03-01', 'syst'), ('3', '2018-04-01', 'op'),
        # Tumor 4
        ('4', '2005-01-01', 'st'), ('4', '2006-02-01', 'syst'), ('4', '2007-03-01', 'op'),
        # Tumors 5-7 (two events each)
        ('5', '2019-01-01', 'op'), ('5', '2020-02-01', 'rad'),
        ('6', '2021-01-01', 'syst'), ('6', '2022-02-01', 'op'),
        ('7', '2014-01-01', 'st'), ('7', '2015-02-01', 'rad'),
        # Tumors 8-12 (single event each)
        ('8', '2025-01-01', 'op'),
        ('9', '2025-02-01', 'op'),
        ('10', '2025-03-01', 'syst'),
        ('11', '2025-04-01', 'rad'),
        ('12', '2025-05-01', 'op'),
    ]
    return pd.DataFrame(rows, columns=['tumor-id', 'diagnosis date', 'treatment'])


def _sankey_format_int(number):
    """Format an integer with '_' as the thousands separator (e.g. 12_345)."""
    return f"{number:,}".replace(',', '_')


def _sankey_link_colors(link_counts):
    """Return one colour per row of ``link_counts``.

    Links that leave the start node are grey. Every other link is coloured by
    its (source event name, target event name) pair, ignoring the position
    prefix, so the same transition has the same colour at every step.
    """
    colors = []
    color_by_transition = {}
    for source_label, target_label in zip(link_counts['source_label'], link_counts['ordered_event_label']):
        if source_label == _SANKEY_START_LABEL:
            colors.append(_SANKEY_START_LINK_COLOR)
            continue
        transition = (
            _SANKEY_EVENT_NAME_RE.search(source_label).group(1),
            _SANKEY_EVENT_NAME_RE.search(target_label).group(1),
        )
        if transition not in color_by_transition:
            # The number of transitions seen so far doubles as the palette cursor.
            color_by_transition[transition] = _SANKEY_LINK_PALETTE[len(color_by_transition) % len(_SANKEY_LINK_PALETTE)]
        colors.append(color_by_transition[transition])
    return colors


def plot_sankey(
    df=None,
    max_events_per_id=None,
    height=None,
    width=None,
    exclude_overlap_id=False,
    exclude_overlap_event=False,
    renderer=None,
    show_start_node=True,
):
    """
    Draw a Sankey diagram of per-ID event sequences from a Pandas DataFrame.

    Columns are identified by position, not by name:
    1. ID
    2. Date (anything ``pd.to_datetime`` can parse)
    3. Event name (converted to string for the node labels)

    Events are ordered by date within each ID; nodes are labelled with that
    position (e.g., "[1] op", "[2] syst") and links count how many IDs move
    from one node to the next. Exact duplicate (ID, date, event) rows are
    dropped first. A built-in demo is plotted if no DataFrame is provided.

    Args:
        df (pd.DataFrame, optional): The event data, column order ID, Date, Event.
            The caller's frame is not modified.
        max_events_per_id (int, optional): Keep only the first N events of each ID.
            If None, all events are used.
        height (int, optional): The height of the plot in pixels.
        width (int, optional): The width of the plot in pixels.
        exclude_overlap_id (bool): If True, drops every ID that has more than one
            event on the same date. Takes precedence over `exclude_overlap_event`.
        exclude_overlap_event (bool): If True, drops only the events that share a
            date within an ID and keeps that ID's other events.
        renderer (str, optional): Passed to ``fig.show``; if None, plotly's default
            renderer is used.
        show_start_node (bool): If True, adds a "[0] start" node linked to every
            ID's first event, which makes IDs with a single event visible.

    Returns:
        The plotly figure after it has been shown, or None when the date column
        cannot be parsed or nothing is left to plot (a message is printed in
        those cases).
    """
    if df is None:
        df = _sankey_demo_frame()
        print("--- Using demo data (data_demo) ---")
        print(df.head().to_string())  # first five rows only
        print("-----------------------------------")

    id_col_name = df.columns[0]
    date_col_name = df.columns[1]
    event_col_name = df.columns[2]

    # Work on a de-duplicated copy so the caller's frame stays untouched.
    df_processed = df.drop_duplicates(subset=[id_col_name, date_col_name, event_col_name]).copy()

    try:
        df_processed[date_col_name] = pd.to_datetime(df_processed[date_col_name])
    except (ValueError, TypeError):
        print(f"Error: Could not convert column '{date_col_name}' to a valid date format.")
        return None

    # --- Optional removal of same-date events (an "overlap") ---
    overlap_title_part = ""
    if exclude_overlap_id or exclude_overlap_event:
        rows_per_id_date = df_processed.groupby([id_col_name, date_col_name]).size()
        overlap_keys = rows_per_id_date[rows_per_id_date > 1].index
        if exclude_overlap_id:
            overlapping_ids = overlap_keys.get_level_values(id_col_name).unique()
            keep_mask = ~df_processed[id_col_name].isin(overlapping_ids)
            overlap_title_part = ", overlap ids excluded"
        else:
            row_keys = df_processed.set_index([id_col_name, date_col_name]).index
            keep_mask = ~row_keys.isin(set(overlap_keys))
            overlap_title_part = ", overlap events excluded"
        df_processed = df_processed[keep_mask].copy()

    # --- Number each ID's events chronologically ---
    df_sorted = df_processed.sort_values(by=[id_col_name, date_col_name])
    df_sorted['event_order'] = df_sorted.groupby(id_col_name).cumcount() + 1

    if max_events_per_id is not None:
        df_sorted = df_sorted[df_sorted['event_order'] <= max_events_per_id].copy()

    if df_sorted.empty:
        print("No valid data to plot after filtering.")
        return None

    # Cast the event to str so non-string columns (ints, categoricals) can be
    # concatenated; rows with a missing event keep a missing label.
    event_names = df_sorted[event_col_name]
    node_labels = '[' + df_sorted['event_order'].astype(str) + '] ' + event_names.astype(str)
    df_sorted['ordered_event_label'] = node_labels.where(event_names.notna())

    # --- Links: each event points back to the previous event of the same ID ---
    link_cols = ['source_label', 'ordered_event_label']
    df_sorted['source_label'] = df_sorted.groupby(id_col_name)['ordered_event_label'].shift(1)
    df_with_links = df_sorted.dropna(subset=['source_label'])[link_cols]

    if show_start_node:
        first_events = df_sorted.groupby(id_col_name).first().reset_index()
        first_events['source_label'] = _SANKEY_START_LABEL
        df_with_links = pd.concat([first_events[link_cols], df_with_links], ignore_index=True)

    link_counts = df_with_links.groupby(link_cols).size().reset_index(name='value')

    if link_counts.empty:
        # Happens e.g. with show_start_node=False when no ID has a second event.
        print("No transitions to plot after filtering.")
        return None

    # --- Nodes: order by position, then by event name ---
    nodes = pd.DataFrame({
        'label': pd.concat([link_counts['source_label'], link_counts['ordered_event_label']]).unique()
    })
    nodes['event_order_num'] = nodes['label'].str.extract(r'\[(\d+)\]', expand=False).astype(float).fillna(0)
    nodes['event_name'] = nodes['label'].str.extract(r'\] (.*)', expand=False).fillna('start')
    sorted_labels = nodes.sort_values(by=['event_order_num', 'event_name'])['label'].tolist()
    label_to_index = {label: position for position, label in enumerate(sorted_labels)}

    # Node captions carry the number of IDs at that node and their share of
    # all IDs that survived the overlap filter.
    total_unique_ids = df_processed[id_col_name].nunique()
    node_counts = df_sorted['ordered_event_label'].value_counts()
    display_labels = []
    for label in sorted_labels:
        count = total_unique_ids if label == _SANKEY_START_LABEL else node_counts.get(label, 0)
        percentage = (count / total_unique_ids) * 100
        display_labels.append(f"{label} {_sankey_format_int(count)} ({int(round(percentage, 0))}%)")

    # --- Title: "[id] over [event], <filters>, n = <ids> (<rows>)" ---
    chart_title = f"[{id_col_name}] over [{event_col_name}]"
    if max_events_per_id is not None:
        chart_title += f", top {max_events_per_id} events"
    chart_title += overlap_title_part
    chart_title += f", n = {_sankey_format_int(total_unique_ids)} ({_sankey_format_int(len(df_processed))})"

    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=display_labels,
            color="blue",
            align="left",
        ),
        link=dict(
            source=link_counts['source_label'].map(label_to_index).tolist(),
            target=link_counts['ordered_event_label'].map(label_to_index).tolist(),
            value=link_counts['value'].tolist(),
            color=_sankey_link_colors(link_counts),
        ),
    )])

    fig.update_layout(title_text=chart_title, font_size=10, height=height, width=width)
    fig.show(renderer=renderer)

    # Hand the figure back so callers can tweak or export it.
    return fig
|
1661
|
+
|
1662
|
+
|
1440
1663
|
# * extend objects to enable chaining
|
1441
1664
|
pd.DataFrame.plot_bars = plot_bars
|
1442
1665
|
pd.DataFrame.plot_stacked_bars = plot_stacked_bars
|
@@ -1445,4 +1668,5 @@ pd.DataFrame.plot_stacked_box = plot_box
|
|
1445
1668
|
pd.DataFrame.plot_stacked_boxes = plot_boxes
|
1446
1669
|
pd.DataFrame.plot_quadrants = plot_quadrants
|
1447
1670
|
pd.DataFrame.plot_histogram = plot_histogram
|
1448
|
-
pd.DataFrame.plot_joint = plot_joint
|
1671
|
+
pd.DataFrame.plot_joint = plot_joint
|
1672
|
+
pd.DataFrame.plot_sankey = plot_sankey
|
pandas_plots/tbl.py
CHANGED
@@ -172,6 +172,9 @@ def describe_df(
|
|
172
172
|
# * fix bug(?) in plotly/choreographer - datetime columns are not plotted, set these to str
|
173
173
|
datetime_cols = df.select_dtypes(include=['datetime64']).columns
|
174
174
|
df[datetime_cols] = df[datetime_cols].astype(str)
|
175
|
+
|
176
|
+
# ! Drop completely empty columns (Series)
|
177
|
+
df = df.dropna(axis=1, how='all')
|
175
178
|
|
176
179
|
# * reduce column names len if selected
|
177
180
|
if top_n_chars_in_columns > 0:
|
@@ -0,0 +1,10 @@
|
|
1
|
+
pandas_plots/hlp.py,sha256=z8rrVNbH9qMohdXPT-FksP-VkTOjI0bGFj47Sw5p3aY,21141
|
2
|
+
pandas_plots/pls.py,sha256=80uXr3bT66LGjDcuT4a0ewCBwATcOUZ3QQ228Hn9glY,60052
|
3
|
+
pandas_plots/tbl.py,sha256=R2E6FLhxNpUtS88Zf88Eh9i8dSKgmJtmFimFvOt0foQ,32780
|
4
|
+
pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
|
5
|
+
pandas_plots-0.13.0.dist-info/licenses/LICENSE,sha256=ltLbQWUCs-GBQlTPXbt5nHNBE9U5LzjjoS1Y8hHETM4,1051
|
6
|
+
pandas_plots-0.13.0.dist-info/METADATA,sha256=G7Vx-tY6PgbcchOatYf8lr1vA62k-ik3Zx3dCSLhWqM,7430
|
7
|
+
pandas_plots-0.13.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
8
|
+
pandas_plots-0.13.0.dist-info/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
|
9
|
+
pandas_plots-0.13.0.dist-info/top_level.txt,sha256=XnaNuIHBqMmCeh_U7nKOYTwFue_SIA0wxuDgdPmnnSk,13
|
10
|
+
pandas_plots-0.13.0.dist-info/RECORD,,
|
@@ -1,10 +0,0 @@
|
|
1
|
-
pandas_plots/hlp.py,sha256=z8rrVNbH9qMohdXPT-FksP-VkTOjI0bGFj47Sw5p3aY,21141
|
2
|
-
pandas_plots/pls.py,sha256=M-UrYcgQHWUeiuqjrq2TNJHca5cHfvS5pEp66Qu2Nrs,48886
|
3
|
-
pandas_plots/tbl.py,sha256=gjBl84mZyx13L0lRo0dztSvo_Gs3FrouklUhTQ5lIIk,32678
|
4
|
-
pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
|
5
|
-
pandas_plots-0.12.29.dist-info/licenses/LICENSE,sha256=ltLbQWUCs-GBQlTPXbt5nHNBE9U5LzjjoS1Y8hHETM4,1051
|
6
|
-
pandas_plots-0.12.29.dist-info/METADATA,sha256=w8gUfQLGOX7JencPD6w-1l_HmZJ1TYvBAgqrLNEvG1s,7431
|
7
|
-
pandas_plots-0.12.29.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
8
|
-
pandas_plots-0.12.29.dist-info/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
|
9
|
-
pandas_plots-0.12.29.dist-info/top_level.txt,sha256=XnaNuIHBqMmCeh_U7nKOYTwFue_SIA0wxuDgdPmnnSk,13
|
10
|
-
pandas_plots-0.12.29.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|