cadence-core 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,40 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ jobs:
9
+ build-and-publish:
10
+ runs-on: ubuntu-latest
11
+ permissions:
12
+ contents: read
13
+
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+
17
+ - name: Set up Python
18
+ uses: actions/setup-python@v5
19
+ with:
20
+ python-version: "3.11"
21
+
22
+ - name: Install uv
23
+ uses: astral-sh/setup-uv@v4
24
+ with:
25
+ version: "latest"
26
+
27
+ - name: Install build dependencies
28
+ run: uv pip install --system hatchling build twine
29
+
30
+ - name: Build package
31
+ run: python -m build
32
+
33
+ - name: Check distribution
34
+ run: twine check dist/*
35
+
36
+ - name: Publish to PyPI
37
+ env:
38
+ TWINE_USERNAME: __token__
39
+ TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
40
+ run: twine upload dist/*
@@ -0,0 +1,25 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ *.so
5
+ .venv/
6
+ venv/
7
+ .uv/
8
+ build/
9
+ dist/
10
+ *.egg-info/
11
+ .eggs/
12
+ .pytest_cache/
13
+ .ruff_cache/
14
+ .mypy_cache/
15
+ *.pt
16
+ *.bin
17
+ *.pth
18
+ *.ckpt
19
+ *.pkl
20
+ local.env
21
+ .env
22
+ *.log
23
+ .DS_Store
24
+ Thumbs.db
25
+ uv.lock
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Amir Rouhollahi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,108 @@
1
+ Metadata-Version: 2.4
2
+ Name: cadence-core
3
+ Version: 0.1.0
4
+ Summary: Flat-MLP with PubMedBERT-enriched self-distillation for clinical next-event prediction
5
+ Project-URL: Homepage, https://github.com/amirrouh/cadence
6
+ Project-URL: Repository, https://github.com/amirrouh/cadence
7
+ Project-URL: Issues, https://github.com/amirrouh/cadence/issues
8
+ Author-email: Amir Rouhollahi <arouhollahi@bwh.harvard.edu>
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: clinical,ehr,healthcare-ml,next-event-prediction,pubmedbert
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
20
+ Requires-Python: >=3.10
21
+ Requires-Dist: huggingface-hub>=0.23
22
+ Requires-Dist: numpy>=1.24
23
+ Requires-Dist: pandas>=2.0
24
+ Requires-Dist: scikit-learn>=1.3
25
+ Requires-Dist: sentence-transformers>=2.7
26
+ Requires-Dist: torch>=2.1
27
+ Requires-Dist: tqdm>=4.66
28
+ Requires-Dist: transformers>=4.40
29
+ Provides-Extra: dev
30
+ Requires-Dist: build; extra == 'dev'
31
+ Requires-Dist: pytest>=7; extra == 'dev'
32
+ Requires-Dist: ruff>=0.5; extra == 'dev'
33
+ Requires-Dist: twine; extra == 'dev'
34
+ Description-Content-Type: text/markdown
35
+
36
+ # Cadence
37
+
38
+ Clinical next-event prediction: a flat-MLP with PubMedBERT-enriched features and self-knowledge distillation, trained on EHR event sequences.
39
+
40
+ ## Install
41
+
42
+ ```bash
43
+ pip install cadence-core
44
+ ```
45
+
46
+ ## Quickstart
47
+
48
+ ### Inference with a pretrained model
49
+
50
+ ```python
51
+ from cadence import Cadence
52
+
53
+ model = Cadence.from_pretrained("amirrouh/cadence-mimic-100k")
54
+ next_event, days_until = model.predict(patient_events)
55
+ ```
56
+
57
+ ### Training on your own data
58
+
59
+ ```python
60
+ from cadence import Cadence
61
+
62
+ model = Cadence()
63
+ model.fit(events_df)
64
+ model.save("my-model/")
65
+ ```
66
+
67
+ ## Input data format
68
+
69
+ `events_df` is a pandas DataFrame with the following columns:
70
+
71
+ - `patient_id` — patient identifier (any hashable type)
72
+ - `timestamp` — event time (datetime or ISO string; coerced via `pd.to_datetime`)
73
+ - `event_text` — free-text event description (e.g. "Patient admitted with chest pain")
74
+ - `cluster_id` — integer event cluster (optional; auto-assigned via sentence-transformers + KMeans if omitted)
75
+
76
+ Example:
77
+
78
+ | patient_id | timestamp | event_text | cluster_id |
79
+ |------------|---------------------|-------------------------------------|------------|
80
+ | P001 | 2024-01-15 09:30 | Patient admitted with chest pain | 3 |
81
+ | P001 | 2024-01-15 11:45 | ECG performed, ST elevation | 7 |
82
+ | P002 | 2024-02-03 14:20 | Routine check-up, vitals normal | 1 |
83
+
84
+ `.predict(patient_events)` returns `(next_event_label, days_until)` for `top_k=1`, or a dict of top-k predictions with confidences when `top_k > 1`.
85
+
86
+ ## Architecture
87
+
88
+ Cadence implements the NVC-Clean v14 champion model:
89
+
90
+ - **Feature engineering**: 884-d handcrafted features (population anomaly scores, narrative velocity, temporal-gap statistics, cluster bag-of-words)
91
+ - **Optional**: PubMedBERT embeddings (mean + last token, 1536-d) appended → 2420-d total input
92
+ - **Backbone**: flat-MLP with BatchNorm (Linear 884→1024→1024→512 with residual skip)
93
+ - **Classification head**: Asymmetric Loss (ASL, Ridnik et al. 2021)
94
+ - **Regression head**: quantile-bin softmax expectation for time-to-next-event
95
+ - **Training**: Phase 1 (frozen) + Phase 2 (full), MixUp augmentation, Stochastic Weight Averaging, self-knowledge distillation
96
+
97
+ ## Citation
98
+
99
+ Manuscript in preparation; citation forthcoming.
100
+
101
+ ## License
102
+
103
+ MIT. Copyright 2026 Amir Rouhollahi.
104
+
105
+ ## Links
106
+
107
+ - GitHub: https://github.com/amirrouh/cadence
108
+ - Issues: https://github.com/amirrouh/cadence/issues
@@ -0,0 +1,73 @@
1
+ # Cadence
2
+
3
+ Clinical next-event prediction: a flat-MLP with PubMedBERT-enriched features and self-knowledge distillation, trained on EHR event sequences.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install cadence-core
9
+ ```
10
+
11
+ ## Quickstart
12
+
13
+ ### Inference with a pretrained model
14
+
15
+ ```python
16
+ from cadence import Cadence
17
+
18
+ model = Cadence.from_pretrained("amirrouh/cadence-mimic-100k")
19
+ next_event, days_until = model.predict(patient_events)
20
+ ```
21
+
22
+ ### Training on your own data
23
+
24
+ ```python
25
+ from cadence import Cadence
26
+
27
+ model = Cadence()
28
+ model.fit(events_df)
29
+ model.save("my-model/")
30
+ ```
31
+
32
+ ## Input data format
33
+
34
+ `events_df` is a pandas DataFrame with the following columns:
35
+
36
+ - `patient_id` — patient identifier (any hashable type)
37
+ - `timestamp` — event time (datetime or ISO string; coerced via `pd.to_datetime`)
38
+ - `event_text` — free-text event description (e.g. "Patient admitted with chest pain")
39
+ - `cluster_id` — integer event cluster (optional; auto-assigned via sentence-transformers + KMeans if omitted)
40
+
41
+ Example:
42
+
43
+ | patient_id | timestamp | event_text | cluster_id |
44
+ |------------|---------------------|-------------------------------------|------------|
45
+ | P001 | 2024-01-15 09:30 | Patient admitted with chest pain | 3 |
46
+ | P001 | 2024-01-15 11:45 | ECG performed, ST elevation | 7 |
47
+ | P002 | 2024-02-03 14:20 | Routine check-up, vitals normal | 1 |
48
+
49
+ `.predict(patient_events)` returns `(next_event_label, days_until)` for `top_k=1`, or a dict of top-k predictions with confidences when `top_k > 1`.
50
+
51
+ ## Architecture
52
+
53
+ Cadence implements the NVC-Clean v14 champion model:
54
+
55
+ - **Feature engineering**: 884-d handcrafted features (population anomaly scores, narrative velocity, temporal-gap statistics, cluster bag-of-words)
56
+ - **Optional**: PubMedBERT embeddings (mean + last token, 1536-d) appended → 2420-d total input
57
+ - **Backbone**: flat-MLP with BatchNorm (Linear 884→1024→1024→512 with residual skip)
58
+ - **Classification head**: Asymmetric Loss (ASL, Ridnik et al. 2021)
59
+ - **Regression head**: quantile-bin softmax expectation for time-to-next-event
60
+ - **Training**: Phase 1 (frozen) + Phase 2 (full), MixUp augmentation, Stochastic Weight Averaging, self-knowledge distillation
61
+
62
+ ## Citation
63
+
64
+ Manuscript in preparation; citation forthcoming.
65
+
66
+ ## License
67
+
68
+ MIT. Copyright 2026 Amir Rouhollahi.
69
+
70
+ ## Links
71
+
72
+ - GitHub: https://github.com/amirrouh/cadence
73
+ - Issues: https://github.com/amirrouh/cadence/issues
@@ -0,0 +1,430 @@
1
+ """Cadence: flat-MLP with PubMedBERT-enriched self-distillation for
2
+ clinical next-event prediction.
3
+
4
+ Quick start
5
+ -----------
6
+ Inference with a pretrained model::
7
+
8
+ from cadence import Cadence
9
+ model = Cadence.from_pretrained("amirrouh/cadence-mimic-100k")
10
+ next_event, days_until = model.predict(patient_events)
11
+
12
+ Training on your own data::
13
+
14
+ from cadence import Cadence
15
+ model = Cadence()
16
+ model.fit(events_df)
17
+ model.save("my-model/")
18
+
19
+ See README.md and examples/quickstart.py for a complete walkthrough.
20
+ """
21
+ from __future__ import annotations
22
+
23
+ import json
24
+ import logging
25
+ from pathlib import Path
26
+ from typing import Dict, List, Optional, Tuple, Union
27
+
28
+ import numpy as np
29
+ import torch
30
+ import torch.nn.functional as F
31
+
32
+ from .config import CadenceConfig
33
+ from .model import NVCFlatMLP
34
+ from .features import (
35
+ build_population_prior,
36
+ build_feature_matrix,
37
+ extract_features,
38
+ LOG_DAYS_CLIP,
39
+ )
40
+ from .data import events_df_to_records, CadenceDataset, validate_events_df
41
+ from .trainer import CadenceTrainer, compute_quantile_bins
42
+ from .pretrained import save_checkpoint, load_checkpoint, download_from_hub
43
+
44
+ __version__ = "0.1.0"
45
+ __all__ = ["Cadence", "CadenceConfig", "__version__"]
46
+
47
+ log = logging.getLogger(__name__)
48
+
49
+
50
+ class Cadence:
51
+ """High-level API for training, inference, and checkpoint management.
52
+
53
+ Parameters
54
+ ----------
55
+ config : CadenceConfig or None
56
+ Hyperparameter configuration. Defaults to ``CadenceConfig()`` (50
57
+ clusters, 884-d features, NVC-Clean v14 champion settings).
58
+
59
+ Examples
60
+ --------
61
+ >>> model = Cadence()
62
+ >>> model.fit(events_df) # trains on your data
63
+ >>> next_event, days = model.predict(patient_df) # single-patient inference
64
+ >>> model.save("my-model/")
65
+ >>> model2 = Cadence.from_pretrained("my-model/")
66
+ """
67
+
68
+ def __init__(self, config: Optional[CadenceConfig] = None) -> None:
69
+ self.config = config or CadenceConfig()
70
+ self._model: Optional[NVCFlatMLP] = None
71
+ self._clusterer = None # CadenceClusterer | None
72
+ self._prior: Optional[dict] = None
73
+ self._bin_centers: Optional[np.ndarray] = None
74
+ self._bin_edges: Optional[np.ndarray] = None
75
+ self._cluster_labels: Optional[dict] = None
76
+ self._device = torch.device(
77
+ "cuda" if torch.cuda.is_available() else "cpu"
78
+ )
79
+
80
+ # ------------------------------------------------------------------
81
+ # Fit
82
+ # ------------------------------------------------------------------
83
+
84
    def fit(
        self,
        events_df,  # pd.DataFrame
        epochs: Optional[int] = None,
        val_df=None,  # pd.DataFrame | None — if None, 10 % split used
        verbose: bool = True,
    ) -> "Cadence":
        """Train Cadence on ``events_df``.

        Pipeline: validate input → (optionally) auto-fit text clusters →
        patient-level train/val split → build records → population prior →
        feature matrices → quantile regression bins → DataLoaders → build
        ``NVCFlatMLP`` → train via ``CadenceTrainer``.

        Parameters
        ----------
        events_df : pd.DataFrame
            Columns: ``patient_id``, ``timestamp``, ``event_text``.
            Optional column ``cluster_id`` (skips auto-clustering when present).
        epochs : int or None
            Total training epochs. Defaults to
            ``config.phase1_epochs + config.phase2_epochs``.
        val_df : pd.DataFrame or None
            Validation dataframe. When None, 10 % of patients are held out.
        verbose : bool
            Whether to log training progress.

        Returns
        -------
        self
        """
        if verbose:
            # NOTE(review): configures the *root* logger from library code,
            # which affects the host application's logging — confirm intended.
            logging.basicConfig(
                level=logging.INFO,
                format="%(asctime)s %(levelname)-8s %(message)s",
                datefmt="%H:%M:%S",
            )

        validate_events_df(events_df)
        cfg = self.config

        # ── Fit clusters if needed ────────────────────────────────────────────
        # Auto-cluster only when the caller did not supply cluster_id.
        if "cluster_id" not in events_df.columns:
            self._fit_clusters_from_df(events_df)

        # ── Train / val split ─────────────────────────────────────────────────
        # Split by patient (not by row) so no patient leaks across the split.
        if val_df is None:
            events_df, val_df = self._split_patients(events_df, val_frac=0.1)

        # ── Build records ─────────────────────────────────────────────────────
        train_records = events_df_to_records(
            events_df, clusterer=self._clusterer,
            n_clusters=cfg.n_clusters, max_history=cfg.max_history,
        )
        val_records = events_df_to_records(
            val_df, clusterer=self._clusterer,
            n_clusters=cfg.n_clusters, max_history=cfg.max_history,
        )
        log.info(
            "Records: train=%d, val=%d", len(train_records), len(val_records)
        )

        # ── Population prior ──────────────────────────────────────────────────
        # Prior is built from TRAIN records only, then reused for both splits
        # and kept on the instance for predict()/save().
        self._prior = build_population_prior(train_records, cfg.n_clusters)

        # ── Feature matrices ──────────────────────────────────────────────────
        X_tr, y_cls_tr, y_reg_tr = build_feature_matrix(
            train_records, self._prior, cfg.n_clusters, cfg.max_history
        )
        X_val, y_cls_val, y_reg_val = build_feature_matrix(
            val_records, self._prior, cfg.n_clusters, cfg.max_history
        )
        log.info("Feature matrix: train=%s, val=%s", X_tr.shape, X_val.shape)

        # Actual feature dim may differ from config default (user data).
        # The config object is mutated in place — presumably so that save()
        # records the true input width; TODO confirm against save_checkpoint.
        n_features = X_tr.shape[1]
        cfg.n_features = n_features

        # ── Quantile bins ─────────────────────────────────────────────────────
        # Regression targets are discretized into quantile bins computed from
        # the training targets only.
        bin_edges, bin_centers = compute_quantile_bins(y_reg_tr, cfg.n_reg_bins)
        self._bin_edges = bin_edges
        self._bin_centers = bin_centers

        # ── DataLoaders ───────────────────────────────────────────────────────
        from torch.utils.data import DataLoader

        train_ds = CadenceDataset(X_tr, y_cls_tr, y_reg_tr)
        val_ds = CadenceDataset(X_val, y_cls_val, y_reg_val)
        train_loader = DataLoader(
            train_ds, batch_size=cfg.batch_size, shuffle=True,
            num_workers=cfg.num_workers, pin_memory=self._device.type == "cuda",
        )
        # Validation can use a larger batch (no gradients kept) and no shuffle.
        val_loader = DataLoader(
            val_ds, batch_size=cfg.batch_size * 2, shuffle=False,
            num_workers=cfg.num_workers,
        )

        # ── Build model ───────────────────────────────────────────────────────
        bin_centers_t = torch.tensor(bin_centers, dtype=torch.float32)
        self._model = NVCFlatMLP(
            n_features=n_features,
            n_classes=cfg.n_clusters,
            bin_centers=bin_centers_t,
            config=cfg,
        ).to(self._device)
        log.info(
            "NVCFlatMLP: n_features=%d, n_classes=%d, params=%d",
            n_features, cfg.n_clusters, self._model.n_params,
        )

        # ── Train ─────────────────────────────────────────────────────────────
        trainer = CadenceTrainer(
            model=self._model,
            config=cfg,
            device=self._device,
            bin_edges=bin_edges,
            bin_centers=bin_centers,
        )
        # trainer.fit returns the (possibly SWA-averaged) final model.
        self._model = trainer.fit(train_loader, val_loader, epochs=epochs)
        return self
199
+
200
+ # ------------------------------------------------------------------
201
+ # Predict
202
+ # ------------------------------------------------------------------
203
+
204
    def predict(
        self,
        patient_events,  # pd.DataFrame — single patient, sorted by timestamp
        top_k: int = 1,
    ) -> Union[Tuple[str, float], dict]:
        """Predict the next event and days-until for one patient.

        Parameters
        ----------
        patient_events : pd.DataFrame
            History for a single patient. Same schema as ``events_df``
            (columns: ``patient_id``, ``timestamp``, ``event_text``).
            Must have at least 2 rows — one history event plus one target
            event are needed to form a prediction example (see the
            ValueError below).
        top_k : int
            When 1, returns ``(event_label, days)``.
            When > 1, returns a dict with ``predictions`` (list of
            ``{label, cluster_id, confidence, days}``).

        Returns
        -------
        (next_event_label, days_until) when top_k=1, else dict.

        Raises
        ------
        RuntimeError
            If the model or population prior is not available.
        ValueError
            If ``patient_events`` yields no prediction records.
        """
        if self._model is None:
            raise RuntimeError(
                "Model is not trained. Call .fit() or .from_pretrained() first."
            )
        if self._prior is None:
            raise RuntimeError(
                "Population prior is missing. The model may not have been "
                "trained with .fit()."
            )

        validate_events_df(patient_events)

        # Build record(s) from the patient's history window.
        records = events_df_to_records(
            patient_events,
            clusterer=self._clusterer,
            n_clusters=self.config.n_clusters,
            max_history=self.config.max_history,
        )

        if not records:
            raise ValueError(
                "patient_events must have at least 2 rows to form one "
                "prediction example (history + target)."
            )

        # Use the last record (most recent history window).
        record = records[-1]
        feat = extract_features(
            record, self._prior,
            n_clusters=self.config.n_clusters,
            max_history=self.config.max_history,
        )

        # Single-example batch on the model's device.
        X = torch.tensor(feat, dtype=torch.float32).unsqueeze(0).to(self._device)

        self._model.eval()
        with torch.no_grad():
            logits, reg_logits = self._model(X)
            days = self._model.predict_days(reg_logits).item()

        probs = F.softmax(logits, dim=-1).squeeze(0).cpu().numpy()

        if top_k == 1:
            best_cid = int(probs.argmax())
            label = self._cluster_label(best_cid)
            return label, days

        # top_k > 1: descending-probability order. Note the same ``days``
        # value (one regression head output) is attached to every candidate.
        top_ids = np.argsort(-probs)[:top_k]
        preds = [
            {
                "label": self._cluster_label(int(cid)),
                "cluster_id": int(cid),
                "confidence": float(probs[cid]),
                "days": days,
            }
            for cid in top_ids
        ]
        return {"predictions": preds}
286
+
287
+ def _cluster_label(self, cluster_id: int) -> str:
288
+ if self._cluster_labels and str(cluster_id) in self._cluster_labels:
289
+ return self._cluster_labels[str(cluster_id)]
290
+ if self._cluster_labels and cluster_id in self._cluster_labels:
291
+ return self._cluster_labels[cluster_id]
292
+ return f"cluster_{cluster_id}"
293
+
294
+ # ------------------------------------------------------------------
295
+ # Save / load
296
+ # ------------------------------------------------------------------
297
+
298
+ def save(self, directory: Union[str, Path]) -> None:
299
+ """Save the model, config, and clusterer to ``directory``.
300
+
301
+ Parameters
302
+ ----------
303
+ directory : str | Path
304
+ """
305
+ if self._model is None:
306
+ raise RuntimeError("No model to save. Call .fit() first.")
307
+
308
+ save_checkpoint(
309
+ model=self._model,
310
+ config=self.config,
311
+ bin_centers=self._bin_centers,
312
+ save_dir=directory,
313
+ clusterer=self._clusterer,
314
+ cluster_labels=self._cluster_labels,
315
+ extra={"prior": self._prior},
316
+ )
317
+
318
    @classmethod
    def from_pretrained(
        cls,
        path_or_repo: Union[str, Path],
        device: Optional[Union[str, torch.device]] = None,
        revision: Optional[str] = None,
    ) -> "Cadence":
        """Load a Cadence model from a local directory or HuggingFace Hub.

        Parameters
        ----------
        path_or_repo : str | Path
            Local directory path OR HuggingFace repo ID (e.g.
            ``"amirrouh/cadence-mimic-100k"``).
        device : str | torch.device | None
        revision : str | None
            HuggingFace revision / tag (ignored for local paths).

        Returns
        -------
        Cadence instance, ready for inference.
        """
        local_path = Path(path_or_repo)

        if not local_path.exists():
            # Not a local directory — treat the string as a Hub repo ID.
            local_path = download_from_hub(
                str(path_or_repo), revision=revision
            )

        model_obj, config, bin_centers, clusterer, cluster_labels = load_checkpoint(
            local_path, device=device
        )

        # Restore population prior if saved in config.json.
        # NOTE(review): assumes save_checkpoint merges its ``extra`` payload
        # into config.json under the "prior" key — verify against
        # pretrained.save_checkpoint; raises FileNotFoundError if
        # config.json is absent.
        cfg_dict = json.loads((local_path / "config.json").read_text())
        prior = cfg_dict.get("prior", None)

        instance = cls(config=config)
        instance._model = model_obj
        instance._clusterer = clusterer
        instance._bin_centers = bin_centers
        instance._cluster_labels = cluster_labels
        instance._prior = prior
        # NOTE(review): _bin_edges is not restored here and stays None on a
        # loaded instance; predict() does not read it, but confirm nothing
        # else does.
        if device is not None:
            instance._device = torch.device(device)
        else:
            # Fall back to wherever load_checkpoint placed the parameters.
            instance._device = next(model_obj.parameters()).device
        return instance
367
+
368
+ # ------------------------------------------------------------------
369
+ # Cluster helpers
370
+ # ------------------------------------------------------------------
371
+
372
+ def fit_clusters(
373
+ self,
374
+ texts: List[str],
375
+ n_clusters: int = 50,
376
+ encoder_model: str = "all-MiniLM-L6-v2",
377
+ ) -> "Cadence":
378
+ """Fit event-text clusters from a list of raw event strings.
379
+
380
+ Call this before ``fit()`` if you want to control the cluster
381
+ fitting step explicitly.
382
+
383
+ Parameters
384
+ ----------
385
+ texts : list of str
386
+ n_clusters : int
387
+ encoder_model : str
388
+
389
+ Returns
390
+ -------
391
+ self
392
+ """
393
+ from .clustering import CadenceClusterer
394
+
395
+ self._clusterer = CadenceClusterer(
396
+ n_clusters=n_clusters, encoder_model=encoder_model
397
+ ).fit(texts)
398
+ self.config.n_clusters = n_clusters
399
+ return self
400
+
401
+ # ------------------------------------------------------------------
402
+ # Internal helpers
403
+ # ------------------------------------------------------------------
404
+
405
+ def _fit_clusters_from_df(self, events_df) -> None:
406
+ """Auto-fit clusters from unique event texts in events_df."""
407
+ from .clustering import CadenceClusterer
408
+
409
+ texts = events_df["event_text"].dropna().unique().tolist()
410
+ log.info(
411
+ "Auto-fitting clusters: %d unique event texts → %d clusters",
412
+ len(texts), self.config.n_clusters,
413
+ )
414
+ self._clusterer = CadenceClusterer(
415
+ n_clusters=self.config.n_clusters,
416
+ encoder_model=self.config.cluster_encoder,
417
+ ).fit(texts)
418
+
419
+ @staticmethod
420
+ def _split_patients(df, val_frac: float = 0.1):
421
+ """Hold out val_frac of patients as the validation set."""
422
+ import pandas as pd
423
+
424
+ patients = np.array(df["patient_id"].unique())
425
+ np.random.shuffle(patients)
426
+ n_val = max(1, int(len(patients) * val_frac))
427
+ val_patients = set(patients[:n_val])
428
+ train_df = df[~df["patient_id"].isin(val_patients)].copy()
429
+ val_df = df[df["patient_id"].isin(val_patients)].copy()
430
+ return train_df, val_df