cehrgpt 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. cehrgpt/analysis/htn_treatment_pathway.py +546 -0
  2. cehrgpt/analysis/treatment_pathway/__init__.py +0 -0
  3. cehrgpt/analysis/treatment_pathway/depression_treatment_pathway.py +94 -0
  4. cehrgpt/analysis/treatment_pathway/diabetes_treatment_pathway.py +94 -0
  5. cehrgpt/analysis/treatment_pathway/htn_treatment_pathway.py +94 -0
  6. cehrgpt/analysis/treatment_pathway/treatment_pathway.py +631 -0
  7. cehrgpt/data/cehrgpt_data_processor.py +549 -0
  8. cehrgpt/data/hf_cehrgpt_dataset.py +4 -0
  9. cehrgpt/data/hf_cehrgpt_dataset_collator.py +285 -652
  10. cehrgpt/data/hf_cehrgpt_dataset_mapping.py +38 -5
  11. cehrgpt/generation/cehrgpt_conditional_generation.py +2 -0
  12. cehrgpt/generation/generate_batch_hf_gpt_sequence.py +20 -12
  13. cehrgpt/generation/omop_converter_batch.py +11 -4
  14. cehrgpt/gpt_utils.py +73 -3
  15. cehrgpt/models/activations.py +27 -0
  16. cehrgpt/models/config.py +6 -2
  17. cehrgpt/models/gpt2.py +560 -0
  18. cehrgpt/models/hf_cehrgpt.py +183 -460
  19. cehrgpt/models/tokenization_hf_cehrgpt.py +380 -50
  20. cehrgpt/omop/ontology.py +154 -0
  21. cehrgpt/runners/hf_cehrgpt_finetune_runner.py +24 -78
  22. cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +48 -44
  23. cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +46 -34
  24. cehrgpt/runners/hyperparameter_search_util.py +180 -69
  25. cehrgpt/runners/sample_packing_trainer.py +11 -2
  26. cehrgpt/tools/linear_prob/compute_cehrgpt_features.py +8 -2
  27. cehrgpt-0.1.4.dist-info/METADATA +238 -0
  28. {cehrgpt-0.1.2.dist-info → cehrgpt-0.1.4.dist-info}/RECORD +32 -22
  29. cehrgpt-0.1.2.dist-info/METADATA +0 -209
  30. /cehrgpt/tools/{merge_synthetic_real_dataasets.py → merge_synthetic_real_datasets.py} +0 -0
  31. {cehrgpt-0.1.2.dist-info → cehrgpt-0.1.4.dist-info}/WHEEL +0 -0
  32. {cehrgpt-0.1.2.dist-info → cehrgpt-0.1.4.dist-info}/licenses/LICENSE +0 -0
  33. {cehrgpt-0.1.2.dist-info → cehrgpt-0.1.4.dist-info}/top_level.txt +0 -0
cehrgpt/runners/hyperparameter_search_util.py
@@ -1,5 +1,5 @@
  from functools import partial
- from typing import Callable, Tuple
+ from typing import Callable, List, Optional, Tuple, Union

  import optuna
  from cehrbert.runners.hf_runner_argument_dataclass import ModelArguments
@@ -64,28 +64,99 @@ class OptunaMetricCallback(TrainerCallback):
          metrics.update({"optuna_best_metric": metrics["eval_loss"]})


- # Define the hyperparameter search space with parameters
- def hp_space(
-     trial: optuna.Trial,
-     lr_range: Tuple[float, float] = (1e-5, 5e-5),
-     batch_sizes=None,
-     weight_decays: Tuple[float, float] = (1e-4, 1e-2),
-     num_train_epochs: Tuple[float, ...] = 10,
- ):
-     if batch_sizes is None:
-         batch_sizes = [4, 8]
+ def get_suggestion(
+     trial,
+     hyperparameter_name: str,
+     hyperparameters: List[Union[float, int]],
+     is_grid: bool = False,
+ ) -> Union[float, int]:
+     """
+     Get hyperparameter suggestion based on search mode.
+
+     Args:
+         trial: Optuna trial object
+         hyperparameter_name: Name of the hyperparameter
+         hyperparameters: List of hyperparameter values
+         is_grid: Whether to use grid search mode
+
+     Returns:
+         Suggested hyperparameter value
+
+     Raises:
+         RuntimeError: If Bayesian mode is used with incorrect number of bounds
+     """
+     if is_grid:
+         return trial.suggest_categorical(hyperparameter_name, hyperparameters)
+
+     # For Bayesian optimization, we need exactly 2 values (lower and upper bounds)
+     if len(hyperparameters) != 2:
+         raise RuntimeError(
+             f"{hyperparameter_name} must contain exactly two values (lower and upper bound) "
+             f"for Bayesian Optimization, but {len(hyperparameters)} values were provided: {hyperparameters}"
+         )
+
+     # Ensure bounds are sorted
+     lower, upper = sorted(hyperparameters)
+     return trial.suggest_float(hyperparameter_name, lower, upper, log=True)
+
+
+ def hp_space(trial: optuna.Trial, cehrgpt_args: CehrGPTArguments):
+     """
+     Define the hyperparameter search space.
+
+     Args:
+         trial: Optuna trial object
+         cehrgpt_args: CehrGPTArguments
+     Returns:
+         Dictionary of hyperparameter suggestions
+     """
+
+     is_grid = cehrgpt_args.hyperparameter_tuning_is_grid
+     learning_rates = cehrgpt_args.hyperparameter_learning_rates
+     weight_decays = cehrgpt_args.hyperparameter_weight_decays
+     batch_sizes = cehrgpt_args.hyperparameter_batch_sizes
+     num_train_epochs = cehrgpt_args.hyperparameter_num_train_epochs
+
      return {
-         "learning_rate": trial.suggest_float("learning_rate", *lr_range, log=True),
+         "learning_rate": get_suggestion(
+             trial, "learning_rate", learning_rates, is_grid
+         ),
          "per_device_train_batch_size": trial.suggest_categorical(
              "per_device_train_batch_size", batch_sizes
          ),
-         "weight_decay": trial.suggest_float("weight_decay", *weight_decays, log=True),
+         "weight_decay": get_suggestion(trial, "weight_decay", weight_decays, is_grid),
          "num_train_epochs": trial.suggest_categorical(
              "num_train_epochs", num_train_epochs
          ),
      }


+ def create_grid_search_space(cehrgpt_args: CehrGPTArguments):
+     """
+     Create the search space dictionary for GridSampler.
+
+     Args:
+         cehrgpt_args: CehrGPTArguments
+
+     Returns:
+         Dictionary defining the grid search space
+     """
+     return {
+         "learning_rate": cehrgpt_args.hyperparameter_learning_rates,
+         "weight_decay": cehrgpt_args.hyperparameter_weight_decays,
+         "per_device_train_batch_size": cehrgpt_args.hyperparameter_batch_sizes,
+         "num_train_epochs": cehrgpt_args.hyperparameter_num_train_epochs,
+     }
+
+
+ def calculate_total_combinations(search_space: dict) -> int:
+     """Calculate total number of combinations in grid search."""
+     total = 1
+     for values in search_space.values():
+         total *= len(values)
+     return total
+
+
  def sample_dataset(data: Dataset, percentage: float, seed: int) -> Dataset:
      """
      Samples a subset of the given dataset based on a specified percentage.
@@ -113,7 +184,7 @@ def sample_dataset(data: Dataset, percentage: float, seed: int) -> Dataset:
      returns the "test" portion, which is the specified percentage of the dataset.
      - Ensure that `percentage` is between 0 and 1 to avoid errors.
      """
-     if percentage == 1.0:
+     if percentage >= 1.0:
          return data

      return data.train_test_split(
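For reference, `sample_dataset` keeps the "test" side of a Hugging Face `train_test_split`; a standalone toy sketch of that behavior with the `datasets` library (illustrative data, not project code):

```python
from datasets import Dataset

data = Dataset.from_dict({"x": list(range(100))})
# percentage < 1.0: keep the "test" split, i.e. 20% of the rows here
subset = data.train_test_split(test_size=0.2, seed=42)["test"]
print(len(subset))  # 20
# percentage >= 1.0 now short-circuits and returns the dataset unchanged
```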
@@ -130,14 +201,13 @@ def perform_hyperparameter_search(
      training_args: TrainingArguments,
      model_args: ModelArguments,
      cehrgpt_args: CehrGPTArguments,
- ) -> TrainingArguments:
+ ) -> Tuple[TrainingArguments, Optional[str]]:
      """
      Perform hyperparameter tuning for the CehrGPT model using Optuna with the Hugging Face Trainer.

-     This function initializes a Trainer with sampled training and validation sets, and performs
-     a hyperparameter search using Optuna. The search tunes learning rate, batch size, and weight decay
-     to optimize model performance based on a specified objective metric (e.g., validation loss).
-     After the search, it updates the provided `TrainingArguments` with the best hyperparameters found.
+     This function supports two modes:
+     1. Bayesian Optimization (TPE): Intelligently explores hyperparameter space using bounds
+     2. Grid Search: Exhaustively tests all combinations of discrete values

      Args:
          trainer_class: A Trainer or its subclass
@@ -147,15 +217,15 @@ def perform_hyperparameter_search(
          training_args (TrainingArguments): Configuration for training parameters (e.g., epochs, evaluation strategy).
          model_args (ModelArguments): Model configuration arguments, including early stopping parameters.
          cehrgpt_args (CehrGPTArguments): Additional arguments specific to CehrGPT, including hyperparameter
-             tuning options such as learning rate range, batch sizes, and tuning percentage.
+             tuning options and search mode configuration.

      Returns:
-         TrainingArguments: Updated `TrainingArguments` instance containing the best hyperparameters found
-         from the search.
+         Tuple[TrainingArguments, Optional[str]]: Updated TrainingArguments with best hyperparameters
+             and optional run_id of the best trial.

      Example:
          ```
-         best_training_args = perform_hyperparameter_search(
+         best_training_args, run_id = perform_hyperparameter_search(
              trainer_class=Trainer,
              model_init=my_model_init,
              dataset=my_dataset_dict,
@@ -176,50 +246,91 @@ def perform_hyperparameter_search(
      Logging:
          Logs the best hyperparameters found at the end of the search.
      """
-     if cehrgpt_args.hyperparameter_tuning:
-         sampled_train = sample_dataset(
-             dataset["train"],
-             cehrgpt_args.hyperparameter_tuning_percentage,
-             training_args.seed,
-         )
-         sampled_val = sample_dataset(
-             dataset["validation"],
-             cehrgpt_args.hyperparameter_tuning_percentage,
-             training_args.seed,
-         )
-         hyperparam_trainer = trainer_class(
-             model_init=model_init,
-             data_collator=data_collator,
-             train_dataset=sampled_train,
-             eval_dataset=sampled_val,
-             callbacks=[
-                 EarlyStoppingCallback(model_args.early_stopping_patience),
-                 OptunaMetricCallback(),
-             ],
-             args=training_args,
-         )
-         # Perform hyperparameter search
-         best_trial = hyperparam_trainer.hyperparameter_search(
-             direction="minimize",
-             hp_space=partial(
-                 hp_space,
-                 lr_range=(cehrgpt_args.lr_low, cehrgpt_args.lr_high),
-                 weight_decays=(
-                     cehrgpt_args.weight_decays_low,
-                     cehrgpt_args.weight_decays_high,
-                 ),
-                 batch_sizes=cehrgpt_args.hyperparameter_batch_sizes,
-                 num_train_epochs=cehrgpt_args.hyperparameter_num_train_epochs,
-             ),
-             backend="optuna",
-             n_trials=cehrgpt_args.n_trials,
-             compute_objective=lambda m: m["optuna_best_metric"],
-             # Ensure reproducibility
-             sampler=optuna.samplers.TPESampler(seed=training_args.seed),
-         )
-         LOG.info("Best hyperparameters: %s", best_trial.hyperparameters)
-         # Update training arguments with best hyperparameters and set epochs based on adjusted effective epochs
-         for k, v in best_trial.hyperparameters.items():
-             setattr(training_args, k, v)
+     if not cehrgpt_args.hyperparameter_tuning:
+         return training_args, None
+
+     # Prepare hyperparameters based on mode
+     if cehrgpt_args.hyperparameter_tuning_is_grid:
+         search_space = create_grid_search_space(cehrgpt_args)
+         total_combinations = calculate_total_combinations(search_space)
+
+         LOG.info(f"Grid search mode: Testing {total_combinations} combinations")
+         LOG.info(f"Search space: {search_space}")
+
+         # Adjust n_trials for grid search if not set appropriately
+         if cehrgpt_args.n_trials < total_combinations:
+             LOG.warning(
+                 f"n_trials ({cehrgpt_args.n_trials}) is less than total combinations ({total_combinations}). "
+                 f"Setting n_trials to {total_combinations} to test all combinations."
+             )
+             cehrgpt_args.n_trials = total_combinations
+
+         # Configure sampler based on search mode
+         sampler = optuna.samplers.GridSampler(search_space, seed=training_args.seed)
+     else:
+         LOG.info("Bayesian optimization mode (TPE)")
+         LOG.info(f"Learning rate bounds: {cehrgpt_args.hyperparameter_learning_rates}")
+         LOG.info(f"Weight decay bounds: {cehrgpt_args.hyperparameter_weight_decays}")
+         LOG.info(f"Batch sizes: {cehrgpt_args.hyperparameter_batch_sizes}")
+         LOG.info(f"Epochs: {cehrgpt_args.hyperparameter_num_train_epochs}")
+         # Configure the TPE sampler
+         sampler = optuna.samplers.TPESampler(seed=training_args.seed)
+
+     # Prepare datasets
+     save_total_limit_original = training_args.save_total_limit
+     training_args.save_total_limit = 1
+
+     sampled_train = sample_dataset(
+         dataset["train"],
+         cehrgpt_args.hyperparameter_tuning_percentage,
+         training_args.seed,
+     )
+     sampled_val = sample_dataset(
+         dataset["validation"],
+         cehrgpt_args.hyperparameter_tuning_percentage,
+         training_args.seed,
+     )
+     # Create trainer
+     hyperparam_trainer = trainer_class(
+         model_init=model_init,
+         data_collator=data_collator,
+         train_dataset=sampled_train,
+         eval_dataset=sampled_val,
+         callbacks=[
+             EarlyStoppingCallback(model_args.early_stopping_patience),
+             OptunaMetricCallback(),
+         ],
+         args=training_args,
+     )
+
+     best_trial = hyperparam_trainer.hyperparameter_search(
+         direction="minimize",
+         hp_space=partial(
+             hp_space,
+             cehrgpt_args=cehrgpt_args,
+         ),
+         backend="optuna",
+         n_trials=cehrgpt_args.n_trials,
+         compute_objective=lambda m: m["optuna_best_metric"],
+         sampler=sampler,
+     )
+
+     # Log results
+     LOG.info("=" * 50)
+     LOG.info("HYPERPARAMETER SEARCH COMPLETED")
+     LOG.info("=" * 50)
+     LOG.info(f"Best hyperparameters: {best_trial.hyperparameters}")
+     LOG.info(f"Best metric (eval_loss): {best_trial.objective}")
+     LOG.info(f"Best run_id: {best_trial.run_id}")
+     LOG.info("=" * 50)
+
+     # Restore original settings and update with best hyperparameters
+     training_args.save_total_limit = save_total_limit_original
+     for k, v in best_trial.hyperparameters.items():
+         setattr(training_args, k, v)
+         LOG.info(f"Updated training_args.{k} = {v}")

-     return training_args
+     return training_args, best_trial.run_id
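The core of this change is the switch between two Optuna samplers. Below is a minimal, standalone sketch of the two modes, assuming `optuna==4.0.0` as pinned in this release's METADATA; the objective function and values are illustrative stand-ins for the Trainer's `optuna_best_metric`, not project code:

```python
import math

import optuna


def objective(trial: optuna.Trial) -> float:
    # Stand-in for the eval-loss objective that the runner minimizes.
    lr = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    batch_size = trial.suggest_categorical("per_device_train_batch_size", [4, 8])
    return abs(math.log10(lr) + 4.5) + 0.01 * batch_size


# Grid mode: GridSampler exhausts every combination of the discrete values,
# so n_trials must cover the full grid (hence the runner bumps n_trials).
grid_space = {
    "learning_rate": [1e-5, 5e-5],
    "per_device_train_batch_size": [4, 8],
}
grid_study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.GridSampler(grid_space, seed=42),
)
grid_study.optimize(objective, n_trials=4)  # 2 x 2 combinations

# Bayesian mode: TPESampler treats the two learning-rate values as
# (lower, upper) bounds, which is why get_suggestion() insists on exactly two.
tpe_study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
tpe_study.optimize(objective, n_trials=10)
```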
cehrgpt/runners/sample_packing_trainer.py
@@ -1,9 +1,10 @@
  from typing import Optional, Union

+ import torch
  from datasets import Dataset
  from torch.utils.data import DataLoader
  from transformers import Trainer
- from transformers.trainer_utils import has_length
+ from transformers.trainer_utils import has_length, seed_worker
  from transformers.utils import import_utils, logging

  from cehrgpt.data.sample_packing_sampler import SamplePackingBatchSampler
@@ -62,7 +63,10 @@ class SamplePackingTrainer(Trainer):
          if "num_of_concepts" in train_dataset.column_names:
              lengths = train_dataset["num_of_concepts"]
          else:
-             lengths = [len(sample["input_ids"]) for sample in train_dataset]
+             lengths = [
+                 len(sample["input_ids"])
+                 for sample in train_dataset.select_columns("input_ids")
+             ]

          LOG.info("Finished computing lengths for the train dataset")
      else:
@@ -102,6 +106,11 @@ class SamplePackingTrainer(Trainer):
              "persistent_workers": self.args.dataloader_persistent_workers,
              "batch_sampler": batch_sampler,
          }
+         if not isinstance(train_dataset, torch.utils.data.IterableDataset):
+             dataloader_params["drop_last"] = self.args.dataloader_drop_last
+             dataloader_params["worker_init_fn"] = seed_worker
+             dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor
+
          return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params))

      def get_eval_dataloader(
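For intuition on why the trainer precomputes per-sample lengths and hands them to a batch sampler, here is a toy, self-contained sketch of token-budget packing; the `pack_by_budget` helper below is hypothetical and stands in for the project's `SamplePackingBatchSampler`:

```python
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader


def pack_by_budget(lengths, max_tokens):
    """Greedily group sample indices so each batch stays within a token budget."""
    batch, used = [], 0
    for idx, n in enumerate(lengths):
        if batch and used + n > max_tokens:
            yield batch
            batch, used = [], 0
        batch.append(idx)
        used += n
    if batch:
        yield batch


sequences = [torch.ones(n, dtype=torch.long) for n in (5, 3, 8, 2, 7)]
lengths = [len(s) for s in sequences]
loader = DataLoader(
    sequences,  # a plain list acts as a map-style dataset
    batch_sampler=list(pack_by_budget(lengths, max_tokens=10)),
    collate_fn=lambda xs: pad_sequence(xs, batch_first=True),
)
for batch in loader:
    print(batch.shape)  # (2, 5), then (2, 8), then (1, 7)
```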
cehrgpt/tools/linear_prob/compute_cehrgpt_features.py
@@ -9,11 +9,15 @@ from typing import Optional, Union

  import numpy as np
  import pandas as pd
+ import polars as pl
  import torch
  import torch.distributed as dist
  from cehrbert.data_generators.hf_data_generator.meds_utils import CacheFileCollector
  from cehrbert.runners.runner_util import generate_prepared_ds_path
  from datasets import concatenate_datasets, load_from_disk
+ from torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook import (
+     batched_powerSGD_hook,
+ )
  from torch.utils.data import DataLoader
  from tqdm import tqdm
  from transformers.trainer_utils import is_main_process
@@ -241,7 +245,6 @@ def main():
          SamplePackingCehrGptDataCollator,
          cehrgpt_args.max_tokens_per_batch,
          cehrgpt_model.config.max_position_embeddings,
-         add_end_token_in_sample_packing=cehrgpt_args.add_end_token_in_sample_packing,
      )
      train_batch_sampler = SamplePackingBatchSampler(
          lengths=train_set["num_of_concepts"],
@@ -332,7 +335,7 @@ def main():

      data_loaders = [("train", train_loader), ("test", test_dataloader)]

-     ve_token_id = cehrgpt_tokenizer._convert_token_to_id("[VE]")
+     ve_token_id = cehrgpt_tokenizer.ve_token_id
      for split, data_loader in data_loaders:
          # Ensure prediction folder exists
          feature_output_folder = (
@@ -383,6 +386,9 @@ def main():
          if "epoch_times" in batch:
              batch.pop("epoch_times")

+         if "ages" in batch:
+             batch.pop("ages")
+
          batch = {k: v.to(device) for k, v in batch.items()}
          # Forward pass
          cehrgpt_output = cehrgpt_model(
cehrgpt-0.1.4.dist-info/METADATA (new file)
@@ -0,0 +1,238 @@
+ Metadata-Version: 2.4
+ Name: cehrgpt
+ Version: 0.1.4
+ Summary: CEHR-GPT: Generating Electronic Health Records with Chronological Patient Timelines
+ Author-email: Chao Pang <chaopang229@gmail.com>, Xinzhuo Jiang <xj2193@cumc.columbia.edu>, Krishna Kalluri <kk3326@cumc.columbia.edu>, Elise Minto <em3697@cumc.columbia.edu>, Jason Patterson <jp3477@cumc.columbia.edu>, Nishanth Parameshwar Pavinkurve <np2689@cumc.columbia.edu>, Karthik Natarajan <kn2174@cumc.columbia.edu>
+ License: MIT License
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Requires-Python: >=3.10.0
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: cehrbert>=1.4.8
+ Requires-Dist: cehrbert_data>=0.1.1
+ Requires-Dist: openai==1.54.3
+ Requires-Dist: optuna==4.0.0
+ Requires-Dist: transformers==4.44.1
+ Requires-Dist: tokenizers==0.19.0
+ Requires-Dist: peft==0.10.0
+ Requires-Dist: lightgbm
+ Requires-Dist: polars
+ Provides-Extra: dev
+ Requires-Dist: pre-commit; extra == "dev"
+ Requires-Dist: pytest; extra == "dev"
+ Requires-Dist: pytest-cov; extra == "dev"
+ Requires-Dist: pytest-subtests; extra == "dev"
+ Requires-Dist: rootutils; extra == "dev"
+ Requires-Dist: hypothesis; extra == "dev"
+ Requires-Dist: black; extra == "dev"
+ Provides-Extra: flash-attn
+ Requires-Dist: flash_attn; extra == "flash-attn"
+ Dynamic: license-file
+
+ # CEHRGPT
+
+ [![PyPI - Version](https://img.shields.io/pypi/v/cehrgpt)](https://pypi.org/project/cehrgpt/)
+ ![Python](https://img.shields.io/badge/-Python_3.11-blue?logo=python&logoColor=white)
+ [![tests](https://github.com/knatarajan-lab/cehrgpt/actions/workflows/tests.yaml/badge.svg)](https://github.com/knatarajan-lab/cehrgpt/actions/workflows/tests.yaml)
+ [![license](https://img.shields.io/badge/License-MIT-green.svg?labelColor=gray)](https://github.com/knatarajan-lab/cehrgpt/blob/main/LICENSE)
+ [![contributors](https://img.shields.io/github/contributors/knatarajan-lab/cehrgpt.svg)](https://github.com/knatarajan-lab/cehrgpt/graphs/contributors)
+
+ CEHRGPT is a multi-task foundation model for structured electronic health record (EHR) data that supports three capabilities: feature representation, zero-shot prediction, and synthetic data generation.
+
+ ## 🎯 Key Capabilities
+
+ ### Feature Representation
+ Extract meaningful patient embeddings from sequences of medical events using **linear probing** techniques for downstream tasks such as disease prediction, patient clustering, and risk stratification.
+
+ ### Zero-Shot Prediction
+ Generate outcome predictions directly from prompts without requiring task-specific training, enabling rapid evaluation in low-label clinical settings.
+
+ ### Synthetic Data Generation
+ Generate comprehensive patient profiles including demographics, medical history, treatment courses, and outcomes while implementing advanced privacy-preserving techniques to ensure the generated data contains no identifiable information.
+ The platform is fully compatible with the OMOP Common Data Model for seamless integration with existing healthcare systems.
+
+ ## 🚀 Installation
+
+ Clone the repository and install dependencies:
+
+ ```bash
+ git clone https://github.com/knatarajan-lab/cehrgpt.git
+ cd cehrgpt
+ pip install .
+ ```
+
+ ## 📋 Prerequisites
+
+ Before getting started, set up the required environment variables:
+
+ ```bash
+ export CEHRGPT_HOME=$(git rev-parse --show-toplevel)
+ export OMOP_DIR=""            # Path to your OMOP data
+ export CEHR_GPT_DATA_DIR=""   # Path for processed data storage
+ export CEHR_GPT_MODEL_DIR=""  # Path for model storage
+ ```
+
+ Create the dataset cache directory:
+ ```bash
+ mkdir $CEHR_GPT_DATA_DIR/dataset_prepared
+ ```
+
+ ## 🏗️ Model Training
+
+ ### Step 1: Generate Pre-training Data from OMOP
+
+ Generate the training data following the [Data Generation Instructions](./data_generation.md).
+
+ ### Step 2: Pre-train CEHR-GPT
+
+ Train the foundation model:
+
+ ```bash
+ python -u -m cehrgpt.runners.hf_cehrgpt_pretrain_runner \
+   --model_name_or_path $CEHR_GPT_MODEL_DIR \
+   --tokenizer_name_or_path $CEHR_GPT_MODEL_DIR \
+   --output_dir $CEHR_GPT_MODEL_DIR \
+   --data_folder "$CEHR_GPT_DATA_DIR/patient_sequence/train" \
+   --dataset_prepared_path "$CEHR_GPT_DATA_DIR/dataset_prepared" \
+   --do_train true --seed 42 \
+   --dataloader_num_workers 16 --dataloader_prefetch_factor 8 \
+   --hidden_size 768 --num_hidden_layers 14 --max_position_embeddings 4096 \
+   --evaluation_strategy epoch --save_strategy epoch \
+   --sample_packing --max_tokens_per_batch 16384 \
+   --warmup_ratio 0.01 --weight_decay 0.01 \
+   --num_train_epochs 50 --learning_rate 0.0002 \
+   --use_early_stopping --early_stopping_threshold 0.001
+ ```
+
+ > **Tip**: Increase `max_position_embeddings` for longer context windows based on your use case.
+
+ ## 🎯 Feature Representation
+
+ CEHR-GPT enables extraction of meaningful patient embeddings from medical event sequences using **linear probing** techniques for downstream prediction tasks. The feature representation pipeline includes label generation, patient sequence extraction, and linear regression model training on the extracted representations.
+
+ For detailed instructions including cohort creation, patient feature extraction, and linear probing evaluation, please follow the [Feature Representation Guide](./feature_representation.md).
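As a concrete picture of the probing step, here is a minimal sketch that fits a logistic-regression probe on frozen patient embeddings, assuming scikit-learn is installed; the arrays are synthetic stand-ins for the features that `cehrgpt.tools.linear_prob.compute_cehrgpt_features` would produce:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Synthetic stand-ins: X would be frozen CEHR-GPT patient embeddings,
# y a binary outcome label per patient.
rng = np.random.default_rng(42)
X = rng.normal(size=(1000, 768))  # 768 matches the --hidden_size used above
y = rng.integers(0, 2, size=1000)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)
probe = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
print("AUROC:", roc_auc_score(y_te, probe.predict_proba(X_te)[:, 1]))
```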
+
+ ## 🔮 Zero-Shot Prediction
+
+ CEHR-GPT can generate outcome predictions directly from clinical prompts without requiring task-specific training, making it ideal for rapid evaluation in low-label clinical settings. The zero-shot prediction capability performs time-to-event analysis by processing patient sequences and generating risk predictions based on learned medical patterns.
+
+ For complete setup instructions including label generation, sequence preparation, and prediction execution, please follow the [Zero-Shot Prediction Guide](./zero_shot_prediction.md).
+
+ ## 🧬 Synthetic Data Generation
+
+ CEHR-GPT generates comprehensive synthetic patient profiles including demographics, medical history, treatment courses, and outcomes while implementing advanced privacy-preserving techniques. The synthetic data maintains statistical fidelity to real patient populations without containing identifiable information, and the outputs are fully compatible with the OMOP Common Data Model.
+
+ For step-by-step instructions on generating synthetic sequences and converting them to OMOP format, please follow the [Synthetic Data Generation Guide](./synthetic_data_generation.md).
+
+ ## 📊 MEDS Support
+
+ CEHR-GPT supports the Medical Event Data Standard (MEDS) format for enhanced interoperability.
+
+ ### Prerequisites
+
+ Configure MEDS-specific environment variables:
+
+ ```bash
+ export CEHR_GPT_MODEL_DIR=""  # CEHR-GPT model directory
+ export MEDS_DIR=""            # MEDS data directory
+ export MEDS_READER_DIR=""     # MEDS reader output directory
+ ```
+
+ ### Step 1: Create MIMIC MEDS Data
+
+ Transform MIMIC files to MEDS format following the [MEDS_transforms](https://github.com/mmcdermott/MEDS_transforms/) repository instructions.
+
+ ### Step 2: Prepare MEDS Reader
+
+ Convert MEDS data for CEHR-GPT compatibility:
+
+ ```bash
+ meds_reader_convert $MEDS_DIR $MEDS_READER_DIR --num_threads 10
+ ```
+
+ ### Step 3: Pre-train with MEDS Data
+
+ Execute pre-training using MEDS format:
+
+ ```bash
+ python -u -m cehrgpt.runners.hf_cehrgpt_pretrain_runner \
+   --model_name_or_path $CEHR_GPT_MODEL_DIR \
+   --tokenizer_name_or_path $CEHR_GPT_MODEL_DIR \
+   --output_dir $CEHR_GPT_MODEL_DIR \
+   --data_folder $MEDS_READER_DIR \
+   --dataset_prepared_path "$CEHR_GPT_MODEL_DIR/dataset_prepared" \
+   --do_train true --seed 42 \
+   --dataloader_num_workers 16 --dataloader_prefetch_factor 8 \
+   --hidden_size 768 --num_hidden_layers 14 --max_position_embeddings 8192 \
+   --evaluation_strategy epoch --save_strategy epoch \
+   --sample_packing --max_tokens_per_batch 16384 \
+   --warmup_steps 500 --weight_decay 0.01 \
+   --num_train_epochs 50 --learning_rate 0.0002 \
+   --use_early_stopping --early_stopping_threshold 0.001 \
+   --is_data_in_meds --inpatient_att_function_type day \
+   --att_function_type day --include_inpatient_hour_token \
+   --include_auxiliary_token --include_demographic_prompt \
+   --meds_to_cehrbert_conversion_type "MedsToBertMimic4"
+ ```
+
+ ### Step 4: Generate MEDS Trajectories
+
+ #### Environment Setup
+
+ Configure the trajectory generation environment:
+
+ ```bash
+ export MEDS_LABEL_COHORT_DIR=""  # Cohort labels directory (parquet files)
+ export MEDS_TRAJECTORY_DIR=""    # Trajectory output directory
+ ```
+
+ #### Generate Synthetic Trajectories
+
+ Create patient trajectories with the trained model:
+
+ ```bash
+ python -u -m cehrgpt.generation.cehrgpt_conditional_generation \
+   --cohort_folder $MEDS_LABEL_COHORT_DIR \
+   --data_folder $MEDS_READER_DIR \
+   --dataset_prepared_path "$CEHR_GPT_MODEL_DIR/dataset_prepared" \
+   --model_name_or_path $CEHR_GPT_MODEL_DIR \
+   --tokenizer_name_or_path $CEHR_GPT_MODEL_DIR \
+   --output_dir $MEDS_TRAJECTORY_DIR \
+   --per_device_eval_batch_size 16 \
+   --num_of_trajectories_per_sample 2 \
+   --generation_input_length 4096 \
+   --generation_max_new_tokens 4096 \
+   --is_data_in_meds \
+   --att_function_type day --inpatient_att_function_type day \
+   --meds_to_cehrbert_conversion_type MedsToBertMimic4 \
+   --include_auxiliary_token --include_demographic_prompt \
+   --include_inpatient_hour_token
+ ```
+
+ > **Important**: Ensure `generation_input_length` + `generation_max_new_tokens` ≤ `max_position_embeddings` (8192).
+
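A quick sanity check of that budget, using the values from the command above:

```python
generation_input_length = 4096
generation_max_new_tokens = 4096
max_position_embeddings = 8192  # from the MEDS pre-training command
assert generation_input_length + generation_max_new_tokens <= max_position_embeddings
```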
+ #### Parameter Reference
+
+ - `generation_input_length`: Input context length for generation
+ - `generation_max_new_tokens`: Maximum number of new tokens to generate
+ - `num_of_trajectories_per_sample`: Number of trajectories per patient sample
+
+ ## 📖 Citation
+
+ If you use CEHRGPT in your research, please cite:
+
+ ```bibtex
+ @article{cehrgpt2024,
+   title={CEHRGPT: Synthetic Data Generation for Electronic Health Records},
+   author={Natarajan, K and others},
+   journal={arXiv preprint arXiv:2402.04400},
+   year={2024}
+ }
+ ```
+
+ ## 📄 License
+
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.