pandas-plots 0.12.30__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
pandas_plots/pls.py CHANGED
@@ -1,6 +1,5 @@
1
1
  from pathlib import Path
2
2
  import warnings
3
-
4
3
  warnings.filterwarnings("ignore")
5
4
 
6
5
  import os
@@ -13,6 +12,7 @@ from plotly import express as px
13
12
  import plotly.graph_objects as go
14
13
  from plotly.subplots import make_subplots
15
14
  import plotly # needed for return types
15
+ import re
16
16
 
17
17
  from .hlp import *
18
18
  from .tbl import print_summary
@@ -1437,6 +1437,229 @@ def plot_facet_stacked_bars(
1437
1437
 
1438
1438
  return fig
1439
1439
 
1440
+
1441
def _sankey_demo_frame():
    # Demo dataset used when plot_sankey() is called without a DataFrame.
    data_demo = {
        'tumor-id': [
            '1', '1', '1', '1', '1',
            '2', '2', '2', '2',
            '3', '3', '3', '3',
            '4', '4', '4',
            '5', '5',
            '6', '6',
            '7', '7',
            '8',
            '9',
            '10',
            '11',
            '12'
        ],
        'diagnosis date': [
            '2020-01-01', '2021-02-01', '2022-03-01', '2023-04-01', '2024-05-01',  # Tumor 1
            '2010-01-01', '2011-02-01', '2012-03-01', '2013-04-01',  # Tumor 2
            '2015-01-01', '2016-02-01', '2017-03-01', '2018-04-01',  # Tumor 3
            '2005-01-01', '2006-02-01', '2007-03-01',  # Tumor 4
            '2019-01-01', '2020-02-01',  # Tumor 5
            '2021-01-01', '2022-02-01',  # Tumor 6
            '2014-01-01', '2015-02-01',  # Tumor 7
            '2025-01-01',  # Tumor 8 (single event)
            '2025-02-01',  # Tumor 9 (single event)
            '2025-03-01',  # Tumor 10 (single event)
            '2025-04-01',  # Tumor 11 (single event)
            '2025-05-01'   # Tumor 12 (single event)
        ],
        'treatment': [
            'op', 'syst', 'op', 'rad', 'op',  # Tumor 1
            'syst', 'st', 'op', 'rad',  # Tumor 2
            'op', 'rad', 'syst', 'op',  # Tumor 3
            'st', 'syst', 'op',  # Tumor 4
            'op', 'rad',  # Tumor 5
            'syst', 'op',  # Tumor 6
            'st', 'rad',  # Tumor 7
            'op',  # Tumor 8
            'op',  # Tumor 9
            'syst',  # Tumor 10
            'rad',  # Tumor 11
            'op'  # Tumor 12
        ]
    }
    return pd.DataFrame(data_demo)


def _sankey_link_colors(link_counts):
    # Assign one stable color per (source event, target event) pair; grey for start links.
    color_palette = [
        "rgba(255, 99, 71, 0.6)", "rgba(60, 179, 113, 0.6)", "rgba(65, 105, 225, 0.6)",
        "rgba(255, 215, 0, 0.6)", "rgba(147, 112, 219, 0.6)", "rgba(0, 206, 209, 0.6)",
        "rgba(255, 160, 122, 0.6)", "rgba(124, 252, 0, 0.6)", "rgba(30, 144, 255, 0.6)",
        "rgba(218, 165, 32, 0.6)"
    ]
    start_link_color = "rgba(128, 128, 128, 0.6)"

    link_colors = []
    link_type_to_color = {}
    for _, row in link_counts.iterrows():
        source_l = row['source_label']
        target_l = row['ordered_event_label']
        if source_l == "[0] start":
            link_colors.append(start_link_color)
            continue
        # Labels have the form "[n] event"; strip the "[n] " prefix to get the event name.
        link_type = (
            re.search(r'\] (.*)', source_l).group(1),
            re.search(r'\] (.*)', target_l).group(1),
        )
        if link_type not in link_type_to_color:
            link_type_to_color[link_type] = color_palette[len(link_type_to_color) % len(color_palette)]
        link_colors.append(link_type_to_color[link_type])
    return link_colors


def plot_sankey(df=None, max_events_per_id=None, height=None, width=None, exclude_overlap_id=False, exclude_overlap_event=False, renderer=None, show_start_node=True):
    """
    Generates a Sankey diagram from a Pandas DataFrame, assuming the column order is:
    1. ID (string or integer)
    2. Date (date, datetime, or string convertible to numeric)
    3. Event Name (string)

    Nodes represent the order of events (e.g., "[1] op", "[2] syst").
    A default demo is shown if no DataFrame is provided.

    Args:
        df (pd.DataFrame, optional): A Pandas DataFrame containing the event data.
            Expected column order: ID, Date, Event.
        max_events_per_id (int, optional): The maximum number of events to display for each ID.
            If None, all events for each ID will be used.
        height (int, optional): The height of the plot in pixels.
        width (int, optional): The width of the plot in pixels.
        exclude_overlap_id (bool): If True, excludes any IDs that have multiple events on the same date.
            This takes precedence over `exclude_overlap_event`.
        exclude_overlap_event (bool): If True, only excludes the specific events that fall on the same date,
            retaining other non-overlapping events for that ID.
        renderer (str, optional): The renderer to use for displaying the plot. Options include
            'browser', 'notebook', 'json', 'png', 'svg', 'jpeg', 'webp', or 'pdf'.
            If None, plotly's default renderer is used.
        show_start_node (bool): If True, adds a visual 'start' node and links all
            first events to it. This is useful for visualizing
            IDs with only one event.

    Returns:
        plotly.graph_objects.Figure | None: the rendered figure (consistent with the
        other plot_* functions in this module), or None when the date column cannot
        be parsed or no data remains after filtering.
    """
    if df is None:
        df = _sankey_demo_frame()
        print("--- Using demo data (data_demo) ---")
        print(df.head().to_string())  # Print first 5 rows of the DataFrame prettily
        print("-----------------------------------")

    # Columns are recognized purely by position: ID, date, event.
    id_col_name = df.columns[0]
    date_col_name = df.columns[1]
    event_col_name = df.columns[2]

    # Aggregate the data to remove duplicate rows before processing.
    df_processed = df.copy().drop_duplicates(subset=[id_col_name, date_col_name, event_col_name])

    try:
        df_processed[date_col_name] = pd.to_datetime(df_processed[date_col_name])
    except (ValueError, TypeError):
        print(f"Error: Could not convert column '{date_col_name}' to a valid date format.")
        return None

    # --- Handle overlap exclusion based on user selection ---
    overlap_title_part = ""
    if exclude_overlap_id:
        # Drop every ID that has two or more events on the same date.
        dup_sizes = df_processed.groupby([id_col_name, date_col_name]).size()
        overlapping_ids = dup_sizes.loc[dup_sizes > 1].index.get_level_values(id_col_name).unique()
        df_processed = df_processed[~df_processed[id_col_name].isin(overlapping_ids)].copy()
        overlap_title_part = ", overlap ids excluded"
    elif exclude_overlap_event:
        # Drop only the (id, date) rows that collide; keep the ID's other events.
        dup_sizes = df_processed.groupby([id_col_name, date_col_name]).size()
        overlapping_event_set = set(dup_sizes.loc[dup_sizes > 1].index)
        df_processed = df_processed[~df_processed.set_index([id_col_name, date_col_name]).index.isin(overlapping_event_set)].copy()
        overlap_title_part = ", overlap events excluded"

    df_sorted = df_processed.sort_values(by=[id_col_name, date_col_name])

    # Vectorized per-ID event numbering instead of per-row loops.
    df_sorted['event_order'] = df_sorted.groupby(id_col_name).cumcount() + 1

    if max_events_per_id is not None:
        df_sorted = df_sorted[df_sorted['event_order'] <= max_events_per_id]

    df_sorted['ordered_event_label'] = '[' + df_sorted['event_order'].astype(str) + '] ' + df_sorted[event_col_name]

    if df_sorted.empty:
        print("No valid data to plot after filtering.")
        return None

    # A vectorized shift yields the predecessor event (link source) per ID.
    df_sorted['source_label'] = df_sorted.groupby(id_col_name)['ordered_event_label'].shift(1)
    df_with_links = df_sorted.dropna(subset=['source_label']).copy()

    # Optionally link every ID's first event to a synthetic start node.
    if show_start_node:
        first_events = df_sorted.groupby(id_col_name).first().reset_index()
        first_events['source_label'] = "[0] start"
        df_with_links = pd.concat(
            [first_events[['source_label', 'ordered_event_label']],
             df_with_links[['source_label', 'ordered_event_label']]],
            ignore_index=True,
        )

    link_counts = df_with_links.groupby(['source_label', 'ordered_event_label']).size().reset_index(name='value')

    # Sort node labels by event order, then event name ("[0] start" sorts first).
    all_labels = pd.concat([link_counts['source_label'], link_counts['ordered_event_label']]).unique()
    unique_labels_df = pd.DataFrame(all_labels, columns=['label'])
    # expand=False makes str.extract return a Series, avoiding DataFrame-to-column assignment.
    unique_labels_df['event_order_num'] = unique_labels_df['label'].str.extract(r'\[(\d+)\]', expand=False).astype(float).fillna(0)
    unique_labels_df['event_name'] = unique_labels_df['label'].str.extract(r'\] (.*)', expand=False).fillna('start')
    unique_unformatted_labels_sorted = unique_labels_df.sort_values(by=['event_order_num', 'event_name'])['label'].tolist()

    label_to_index = {label: i for i, label in enumerate(unique_unformatted_labels_sorted)}

    # Percentages are relative to the number of distinct IDs after preprocessing.
    total_unique_ids = df_processed[id_col_name].nunique()

    display_labels = []
    node_counts = df_sorted['ordered_event_label'].value_counts()
    for label in unique_unformatted_labels_sorted:
        count = total_unique_ids if label == "[0] start" else node_counts.get(label, 0)
        percentage = (count / total_unique_ids) * 100
        formatted_count = f"{count:,}".replace(',', '_')
        formatted_percentage = f"({int(round(percentage, 0))}%)"
        display_labels.append(f"{label} {formatted_count} {formatted_percentage}")

    # Map sources and targets to node indices.
    sources = link_counts['source_label'].map(label_to_index).tolist()
    targets = link_counts['ordered_event_label'].map(label_to_index).tolist()
    values = link_counts['value'].tolist()
    link_colors = _sankey_link_colors(link_counts)

    formatted_total_ids = f"{total_unique_ids:,}".replace(',', '_')
    formatted_total_rows = f"{len(df_processed):,}".replace(',', '_')

    chart_title = f"[{id_col_name}] over [{event_col_name}]"
    if max_events_per_id is not None:
        chart_title += f", top {max_events_per_id} events"
    chart_title += overlap_title_part
    chart_title += f", n = {formatted_total_ids} ({formatted_total_rows})"

    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=display_labels,
            color="blue",
            align="left"
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values,
            color=link_colors
        )
    )])

    fig.update_layout(title_text=chart_title, font_size=10, height=height, width=width)
    fig.show(renderer=renderer)
    # Return the figure for chaining / further customization, like the other plot_* functions.
    return fig
1662
+
1440
1663
  # * extend objects to enable chaining
1441
1664
  pd.DataFrame.plot_bars = plot_bars
1442
1665
  pd.DataFrame.plot_stacked_bars = plot_stacked_bars
@@ -1445,4 +1668,5 @@ pd.DataFrame.plot_stacked_box = plot_box
1445
1668
  pd.DataFrame.plot_stacked_boxes = plot_boxes
1446
1669
  pd.DataFrame.plot_quadrants = plot_quadrants
1447
1670
  pd.DataFrame.plot_histogram = plot_histogram
1448
- pd.DataFrame.plot_joint = plot_joint
1671
+ pd.DataFrame.plot_joint = plot_joint
1672
+ pd.DataFrame.plot_sankey = plot_sankey
@@ -1,38 +1,34 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pandas-plots
3
- Version: 0.12.30
3
+ Version: 0.14.0
4
4
  Summary: A collection of helper for table handling and visualization
5
- Home-page: https://github.com/smeisegeier/pandas-plots
6
- Author: smeisegeier
7
- Author-email: dexterDSDo@googlemail.com
8
- License: MIT
9
- Project-URL: Documentation, https://github.com/smeisegeier/pandas-plots
10
- Project-URL: Source Code, https://github.com/smeisegeier/pandas-plots
5
+ Project-URL: Homepage, https://github.com/smeisegeier/pandas-plots
6
+ Project-URL: Repository, https://github.com/smeisegeier/pandas-plots
11
7
  Project-URL: Bug Tracker, https://github.com/smeisegeier/pandas-plots/issues
12
- Classifier: License :: OSI Approved :: MIT License
13
- Classifier: Programming Language :: Python :: 3
14
- Classifier: Programming Language :: Python :: 3.10
8
+ Author-email: smeisegeier <meisegeiers@rki.de>
9
+ License-File: LICENSE
10
+ Keywords: pivot,plot,plotly,tables,venn,vizualization
15
11
  Classifier: Development Status :: 4 - Beta
16
12
  Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
17
14
  Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
18
17
  Classifier: Topic :: Scientific/Engineering
19
18
  Requires-Python: >=3.10
20
- Description-Content-Type: text/markdown
21
- License-File: LICENSE
22
- Requires-Dist: pandas>=2.0.0
23
- Requires-Dist: plotly<6
24
- Requires-Dist: matplotlib>=3.8.2
19
+ Requires-Dist: dataframe-image>=0.2.6
20
+ Requires-Dist: duckdb>=1.3.0
21
+ Requires-Dist: jinja2>=3.1.4
25
22
  Requires-Dist: matplotlib-venn==0.11.10
26
- Requires-Dist: seaborn>=0.13.2
27
- Requires-Dist: Jinja2>=3.1.4
28
- Requires-Dist: requests>=2.32.0
29
- Requires-Dist: numpy<2.0.0
23
+ Requires-Dist: matplotlib>=3.8.2
30
24
  Requires-Dist: missingno>=0.5.2
31
- Requires-Dist: duckdb>=1.0.0
32
- Requires-Dist: kaleido>=0.2.0
33
25
  Requires-Dist: nbformat>=4.2.0
34
- Requires-Dist: dataframe_image>=0.2.6
35
- Dynamic: license-file
26
+ Requires-Dist: numpy<2.0.0
27
+ Requires-Dist: pandas>=2.0.0
28
+ Requires-Dist: plotly>=6.2
29
+ Requires-Dist: requests>=2.32.0
30
+ Requires-Dist: seaborn>=0.13.2
31
+ Description-Content-Type: text/markdown
36
32
 
37
33
  # pandas-plots
38
34
 
@@ -98,6 +94,7 @@ tbl.show_num_df(
98
94
  - `plot_joints()` a joint plot for **exactly two numerical** columns
99
95
  - `plot_quadrants()` quickly shows a 2x2 heatmap
100
96
  - `plot_facet_stacked_bars()` shows stacked bars for a facet value as subplots
97
+ - `plot_sankey()` generates a Sankey diagram
101
98
  <br>
102
99
 
103
100
  - `ven` offers functions for _venn diagrams_
@@ -175,4 +172,4 @@ _df, _details = ven.show_venn3(
175
172
 
176
173
  ## tags
177
174
 
178
- #pandas, #plotly, #visualizations, #statistics
175
+ #pandas, #plotly, #visualizations, #statistics
@@ -0,0 +1,9 @@
1
+ pandas_plots/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ pandas_plots/hlp.py,sha256=z8rrVNbH9qMohdXPT-FksP-VkTOjI0bGFj47Sw5p3aY,21141
3
+ pandas_plots/pls.py,sha256=80uXr3bT66LGjDcuT4a0ewCBwATcOUZ3QQ228Hn9glY,60052
4
+ pandas_plots/tbl.py,sha256=R2E6FLhxNpUtS88Zf88Eh9i8dSKgmJtmFimFvOt0foQ,32780
5
+ pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
6
+ pandas_plots-0.14.0.dist-info/METADATA,sha256=tw4QxZ9io1c9MgSESxsrGHdKXqoTr9-xNfOpV5hxfUo,7394
7
+ pandas_plots-0.14.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
8
+ pandas_plots-0.14.0.dist-info/licenses/LICENSE,sha256=ltLbQWUCs-GBQlTPXbt5nHNBE9U5LzjjoS1Y8hHETM4,1051
9
+ pandas_plots-0.14.0.dist-info/RECORD,,
@@ -1,5 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: hatchling 1.27.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
-
@@ -1,10 +0,0 @@
1
- pandas_plots/hlp.py,sha256=z8rrVNbH9qMohdXPT-FksP-VkTOjI0bGFj47Sw5p3aY,21141
2
- pandas_plots/pls.py,sha256=M-UrYcgQHWUeiuqjrq2TNJHca5cHfvS5pEp66Qu2Nrs,48886
3
- pandas_plots/tbl.py,sha256=R2E6FLhxNpUtS88Zf88Eh9i8dSKgmJtmFimFvOt0foQ,32780
4
- pandas_plots/ven.py,sha256=2x3ACo2vSfO3q6fv-UdDQ0h1SJyt8WChBGgE5SDCdCk,11673
5
- pandas_plots-0.12.30.dist-info/licenses/LICENSE,sha256=ltLbQWUCs-GBQlTPXbt5nHNBE9U5LzjjoS1Y8hHETM4,1051
6
- pandas_plots-0.12.30.dist-info/METADATA,sha256=vI58okMF7hvV_OuE7dRDQxKyP_W7umWL6_PntoMK2L0,7431
7
- pandas_plots-0.12.30.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
- pandas_plots-0.12.30.dist-info/pii.py,sha256=2WKE-W9s285jPdsTqCgt1uxuW4lj1PYCVOYB2fYDNwQ,2195
9
- pandas_plots-0.12.30.dist-info/top_level.txt,sha256=XnaNuIHBqMmCeh_U7nKOYTwFue_SIA0wxuDgdPmnnSk,13
10
- pandas_plots-0.12.30.dist-info/RECORD,,
@@ -1,76 +0,0 @@
1
- import pandas as pd
2
- import re
3
-
4
-
5
def remove_pii(
    series: pd.Series,
    verbose: bool = True,
    logging: bool = False,
    custom_regex: str = "",
) -> pd.Index:
    """
    Remove personally identifiable information (PII) from the given column.

    Parameters:
    - series: A pandas Series representing a column in a DataFrame.
    - verbose: If True, print pii items
    - logging: If True, write pii items into the file .pii.log
    - custom_regex: Regex that is injected into detection

    Returns:
    - index object with indexes of all pii items

    Remarks:
    - df.drop(axis=0, index=result, inplace=True)
    """

    # * reject empty columns (explicit raise survives `python -O`, unlike assert)
    if series.empty:
        raise AssertionError("series must not be empty")

    # * na must be dropped to ensure processing; dropna returns a copy
    col = series.dropna()

    # * find salutation / address / clinic terms (German), case-insensitive
    _terms = frozenset(["lösch", "herr", "frau", "strasse", "klinik"])
    idx_terms = col[
        col.str.contains(
            "|".join(_terms),
            case=False,
            regex=True,
        )
    ].index

    # # * optional: search for terms in whole df
    # df.apply(lambda row: row.astype(str).str.contains('test', case=False, regex=True).any(), axis=1)

    # * find dates in dd.mm.yyyy format
    ptr_date = r"\d{2}\.\d{2}\.\d{4}"
    idx_date = col[col.str.contains(ptr_date, regex=True)].index

    # * doctor / salutation abbreviations
    # NOTE: previous pattern r"[D|d][R|r]\. ..." treated '|' as a literal inside
    # the character class, so it also matched strings like "|r. " — fixed.
    ptr_dr = r"[Dd][Rr]\. | Fr\. | Hr\. | PD "
    idx_dr = col[col.str.contains(ptr_dr, regex=True)].index

    # * custom caller-supplied pattern (skipped when empty)
    idx_custom = (
        col[col.str.contains(custom_regex, regex=True)].index
        if custom_regex
        else pd.Index([])
    )

    idx_all = idx_terms.union(idx_date).union(idx_dr).union(idx_custom)

    if verbose:
        print(f"found {len(idx_all):_} pii items:")
        print(col.loc[idx_all].tolist())

    if logging:
        data = col.loc[idx_all]
        with open(".pii.log", "w") as f:
            # ! when using str(), it will give only a summary!
            f.write(data.to_string(index=True))

    return idx_all
@@ -1 +0,0 @@
1
- pandas_plots