PyPI - cehrgpt - Versions diffs - 0.0.1__py3-none-any.whl - Mend

cehrgpt 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

__init__.py +0 -0
cehrgpt/__init__.py +0 -0
cehrgpt/analysis/__init__.py +0 -0
cehrgpt/analysis/privacy/__init__.py +0 -0
cehrgpt/analysis/privacy/attribute_inference.py +275 -0
cehrgpt/analysis/privacy/attribute_inference_config.yml +8975 -0
cehrgpt/analysis/privacy/member_inference.py +172 -0
cehrgpt/analysis/privacy/nearest_neighbor_inference.py +189 -0
cehrgpt/analysis/privacy/reid_inference.py +407 -0
cehrgpt/analysis/privacy/utils.py +255 -0
cehrgpt/cehrgpt_args.py +142 -0
cehrgpt/data/__init__.py +0 -0
cehrgpt/data/hf_cehrgpt_dataset.py +80 -0
cehrgpt/data/hf_cehrgpt_dataset_collator.py +482 -0
cehrgpt/data/hf_cehrgpt_dataset_mapping.py +116 -0
cehrgpt/generation/__init__.py +0 -0
cehrgpt/generation/chatgpt_generation.py +106 -0
cehrgpt/generation/generate_batch_hf_gpt_sequence.py +333 -0
cehrgpt/generation/omop_converter_batch.py +644 -0
cehrgpt/generation/omop_entity.py +515 -0
cehrgpt/gpt_utils.py +331 -0
cehrgpt/models/__init__.py +0 -0
cehrgpt/models/config.py +205 -0
cehrgpt/models/hf_cehrgpt.py +1817 -0
cehrgpt/models/hf_modeling_outputs.py +158 -0
cehrgpt/models/pretrained_embeddings.py +82 -0
cehrgpt/models/special_tokens.py +30 -0
cehrgpt/models/tokenization_hf_cehrgpt.py +1077 -0
cehrgpt/omop/__init__.py +0 -0
cehrgpt/omop/condition_era.py +20 -0
cehrgpt/omop/observation_period.py +43 -0
cehrgpt/omop/omop_argparse.py +38 -0
cehrgpt/omop/omop_table_builder.py +86 -0
cehrgpt/omop/queries/__init__.py +0 -0
cehrgpt/omop/queries/condition_era.py +86 -0
cehrgpt/omop/queries/observation_period.py +135 -0
cehrgpt/omop/sample_omop_tables.py +71 -0
cehrgpt/runners/__init__.py +0 -0
cehrgpt/runners/gpt_runner_util.py +99 -0
cehrgpt/runners/hf_cehrgpt_finetune_runner.py +746 -0
cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +370 -0
cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +137 -0
cehrgpt/runners/hyperparameter_search_util.py +223 -0
cehrgpt/time_to_event/__init__.py +0 -0
cehrgpt/time_to_event/config/30_day_readmission.yaml +8 -0
cehrgpt/time_to_event/config/next_visit_type_prediction.yaml +8 -0
cehrgpt/time_to_event/config/t2dm_hf.yaml +8 -0
cehrgpt/time_to_event/time_to_event_model.py +226 -0
cehrgpt/time_to_event/time_to_event_prediction.py +347 -0
cehrgpt/time_to_event/time_to_event_utils.py +55 -0
cehrgpt/tools/__init__.py +0 -0
cehrgpt/tools/ehrshot_benchmark.py +74 -0
cehrgpt/tools/generate_pretrained_embeddings.py +130 -0
cehrgpt/tools/merge_synthetic_real_dataasets.py +218 -0
cehrgpt/tools/upload_omop_tables.py +108 -0
cehrgpt-0.0.1.dist-info/LICENSE +21 -0
cehrgpt-0.0.1.dist-info/METADATA +66 -0
cehrgpt-0.0.1.dist-info/RECORD +60 -0
cehrgpt-0.0.1.dist-info/WHEEL +5 -0
cehrgpt-0.0.1.dist-info/top_level.txt +2 -0

cehrgpt/runners/hyperparameter_search_util.py ADDED Viewed

@@ -0,0 +1,223 @@
+from functools import partial
+from typing import Callable, Tuple, Union
+import optuna
+from cehrbert.runners.hf_runner_argument_dataclass import ModelArguments
+from datasets import Dataset, DatasetDict
+from transformers import (
+    EarlyStoppingCallback,
+    Trainer,
+    TrainerCallback,
+    TrainingArguments,
+)
+from transformers.utils import logging
+from cehrgpt.data.hf_cehrgpt_dataset_collator import CehrGptDataCollator
+from cehrgpt.runners.hf_gpt_runner_argument_dataclass import CehrGPTArguments
+LOG = logging.get_logger("transformers")
+class OptunaMetricCallback(TrainerCallback):
+    """
+    A custom callback to store the best metric in the evaluation metrics dictionary during training.
+    This callback monitors the training state and updates the metrics dictionary with the `best_metric`
+    (e.g., the lowest `eval_loss` or highest accuracy) observed during training. It ensures that the
+    best metric value is preserved in the final evaluation results, even if early stopping occurs.
+    Attributes:
+        None
+    Methods:
+        on_evaluate(args, state, control, **kwargs):
+            Called during evaluation. Adds `state.best_metric` to `metrics` if it exists.
+    Example Usage:
+        ```
+        store_best_metric_callback = StoreBestMetricCallback()
+        trainer = Trainer(
+            model=model,
+            args=training_args,
+            train_dataset=train_dataset,
+            eval_dataset=val_dataset,
+            callbacks=[store_best_metric_callback]
+        )
+        ```
+    """
+    def on_evaluate(self, args, state, control, **kwargs):
+        """
+        During evaluation, adds the best metric value to the metrics dictionary if it exists.
+        Args:
+            args: Training arguments.
+            state: Trainer state object that holds information about training progress.
+            control: Trainer control object to modify training behavior.
+            **kwargs: Additional keyword arguments, including `metrics`, which holds evaluation metrics.
+        Updates:
+            `metrics["best_metric"]`: Sets this to `state.best_metric` if available.
+        """
+        # Check if best metric is available and add it to metrics if it exists
+        metrics = kwargs.get("metrics", {})
+        if state.best_metric is not None:
+            metrics.update(
+                {"optuna_best_metric": min(state.best_metric, metrics["eval_loss"])}
+            )
+        else:
+            metrics.update({"optuna_best_metric": metrics["eval_loss"]})
+# Define the hyperparameter search space with parameters
+def hp_space(
+    trial: optuna.Trial,
+    lr_range: Tuple[float, float] = (1e-5, 5e-5),
+    batch_sizes=None,
+    weight_decays: Tuple[float, float] = (1e-4, 1e-2),
+    num_train_epochs: Tuple[float, ...] = 10,
+):
+    if batch_sizes is None:
+        batch_sizes = [4, 8]
+    return {
+        "learning_rate": trial.suggest_float("learning_rate", *lr_range, log=True),
+        "per_device_train_batch_size": trial.suggest_categorical(
+            "per_device_train_batch_size", batch_sizes
+        ),
+        "weight_decay": trial.suggest_float("weight_decay", *weight_decays, log=True),
+        "num_train_epochs": trial.suggest_int("num_train_epochs", *num_train_epochs),
+    }
+def sample_dataset(data: Dataset, percentage: float, seed: int) -> Dataset:
+    """
+    Samples a subset of the given dataset based on a specified percentage.
+    This function uses a random train-test split to select a subset of the dataset, returning a sample
+    that is approximately `percentage` of the total dataset size. It is useful for creating smaller
+    datasets for tasks such as hyperparameter tuning or quick testing.
+    Args:
+        data (Dataset): The input dataset to sample from.
+        percentage (float): The fraction of the dataset to sample, represented as a decimal
+                            (e.g., 0.1 for 10%).
+        seed (int): A random seed for reproducibility in the sampling process.
+    Returns:
+        Dataset: A sampled subset of the input dataset containing `percentage` of the original data.
+    Example:
+        ```
+        sampled_data = sample_dataset(my_dataset, percentage=0.1, seed=42)
+        ```
+    Notes:
+        - The `train_test_split` method splits the dataset into "train" and "test" portions. This function
+          returns the "test" portion, which is the specified percentage of the dataset.
+        - Ensure that `percentage` is between 0 and 1 to avoid errors.
+    """
+    if percentage == 1.0:
+        return data
+    return data.train_test_split(
+        test_size=percentage,
+        seed=seed,
+    )["test"]
+def perform_hyperparameter_search(
+    model_init: Callable,
+    dataset: DatasetDict,
+    data_collator: CehrGptDataCollator,
+    training_args: TrainingArguments,
+    model_args: ModelArguments,
+    cehrgpt_args: CehrGPTArguments,
+) -> TrainingArguments:
+    """
+    Perform hyperparameter tuning for the CehrGPT model using Optuna with the Hugging Face Trainer.
+    This function initializes a Trainer with sampled training and validation sets, and performs
+    a hyperparameter search using Optuna. The search tunes learning rate, batch size, and weight decay
+    to optimize model performance based on a specified objective metric (e.g., validation loss).
+    After the search, it updates the provided `TrainingArguments` with the best hyperparameters found.
+    Args:
+        model_init (Callable): A function to initialize the model, used for each hyperparameter trial.
+        dataset (DatasetDict): A Hugging Face DatasetDict containing "train" and "validation" datasets.
+        data_collator (CehrGptDataCollator): A data collator for processing batches.
+        training_args (TrainingArguments): Configuration for training parameters (e.g., epochs, evaluation strategy).
+        model_args (ModelArguments): Model configuration arguments, including early stopping parameters.
+        cehrgpt_args (CehrGPTArguments): Additional arguments specific to CehrGPT, including hyperparameter
+                                         tuning options such as learning rate range, batch sizes, and tuning percentage.
+    Returns:
+        TrainingArguments: Updated `TrainingArguments` instance containing the best hyperparameters found
+                           from the search.
+    Example:
+        ```
+        best_training_args = perform_hyperparameter_search(
+            model_init=my_model_init,
+            dataset=my_dataset_dict,
+            data_collator=my_data_collator,
+            training_args=initial_training_args,
+            model_args=model_args,
+            cehrgpt_args=cehrgpt_args
+        )
+        ```
+    Notes:
+        - If `cehrgpt_args.hyperparameter_tuning` is set to `True`, this function samples a portion of the
+          training and validation datasets for efficient tuning.
+        - `EarlyStoppingCallback` is added to the Trainer if early stopping is enabled in `model_args`.
+        - Optuna's `hyperparameter_search` is configured with the specified number of trials (`n_trials`)
+          and learning rate and batch size ranges provided in `cehrgpt_args`.
+    Logging:
+        Logs the best hyperparameters found at the end of the search.
+    """
+    if cehrgpt_args.hyperparameter_tuning:
+        sampled_train = sample_dataset(
+            dataset["train"],
+            cehrgpt_args.hyperparameter_tuning_percentage,
+            training_args.seed,
+        )
+        sampled_val = sample_dataset(
+            dataset["validation"],
+            cehrgpt_args.hyperparameter_tuning_percentage,
+            training_args.seed,
+        )
+        hyperparam_trainer = Trainer(
+            model_init=model_init,
+            data_collator=data_collator,
+            train_dataset=sampled_train,
+            eval_dataset=sampled_val,
+            callbacks=[
+                EarlyStoppingCallback(model_args.early_stopping_patience),
+                OptunaMetricCallback(),
+            ],
+            args=training_args,
+        )
+        # Perform hyperparameter search
+        best_trial = hyperparam_trainer.hyperparameter_search(
+            direction="minimize",
+            hp_space=partial(
+                hp_space,
+                lr_range=(cehrgpt_args.lr_low, cehrgpt_args.lr_high),
+                weight_decays=(
+                    cehrgpt_args.weight_decays_low,
+                    cehrgpt_args.weight_decays_high,
+                ),
+                batch_sizes=cehrgpt_args.hyperparameter_batch_sizes,
+                num_train_epochs=cehrgpt_args.hyperparameter_num_train_epochs,
+            ),
+            backend="optuna",
+            n_trials=cehrgpt_args.n_trials,
+            compute_objective=lambda m: m["optuna_best_metric"],
+        )
+        LOG.info("Best hyperparameters: %s", best_trial.hyperparameters)
+        # Update training arguments with best hyperparameters and set epochs based on adjusted effective epochs
+        for k, v in best_trial.hyperparameters.items():
+            setattr(training_args, k, v)
+    return training_args

cehrgpt/time_to_event/__init__.py ADDED Viewed

File without changes

cehrgpt/time_to_event/config/30_day_readmission.yaml ADDED Viewed

@@ -0,0 +1,8 @@
+# Task settings for 30-day readmission prediction.
+# NOTE(review): the visit/window keys share their names with the keyword arguments of
+# TimeToEventModel.predict_time_to_events; presumably the loader forwards them -- confirm.
+task_name: "30_day_readmission_prediction"
+# Tokens counted as the outcome (presumably visit concept ids -- confirm).
+outcome_events: ["9201", "262", "8971", "8920"]
+include_descendants: false
+future_visit_start: 0
+# -1 is the "no limit" sentinel checked by predict_time_to_events.
+future_visit_end: -1
+prediction_window_start: 0
+# Compared against the accumulated time delta in days in predict_time_to_events.
+prediction_window_end: 30
+max_new_tokens: 128

cehrgpt/time_to_event/config/next_visit_type_prediction.yaml ADDED Viewed

@@ -0,0 +1,8 @@
+# Task settings for predicting the type of the next visit.
+# NOTE(review): no prediction_window_* or max_new_tokens keys are set here, unlike the other
+# task configs; presumably the consumer falls back to its own defaults -- confirm.
+task_name: "next_visit_type_prediction"
+# Tokens counted as the outcome (presumably visit-type concept ids -- confirm).
+outcome_events: [
+  '9202', '9203', '581477', '9201', '5083', '262', '38004250', '8883', '38004238', '38004251',
+  '38004222', '38004268', '38004228', '32693', '8971', '38004269', '38004193', '32036', '8782'
+]
+include_descendants: false
+future_visit_start: 0
+# Same name as the visit-count cutoff of predict_time_to_events (-1 would mean "no limit").
+future_visit_end: 1

cehrgpt/time_to_event/config/t2dm_hf.yaml ADDED Viewed

@@ -0,0 +1,8 @@
+# Task settings for the "t2dm_hf" prediction task.
+# NOTE(review): the visit/window keys share their names with the keyword arguments of
+# TimeToEventModel.predict_time_to_events; presumably the loader forwards them -- confirm.
+task_name: "t2dm_hf_prediction"
+# Tokens counted as the outcome.
+outcome_events: ["316139"]
+future_visit_start: 0
+# -1 is the "no limit" sentinel checked by predict_time_to_events.
+future_visit_end: -1
+# Compared against the accumulated time delta in days in predict_time_to_events.
+prediction_window_start: 30
+# -1 is the "no limit" sentinel checked by predict_time_to_events.
+prediction_window_end: -1
+max_new_tokens: 512
+include_descendants: true

cehrgpt/time_to_event/time_to_event_model.py ADDED Viewed

@@ -0,0 +1,226 @@
+import math
+from collections import Counter
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+import numpy as np
+import torch
+from cehrbert_data.decorators.patient_event_decorator_base import time_month_token
+from transformers import GenerationConfig
+from cehrgpt.gpt_utils import (
+    extract_time_interval_in_days,
+    is_att_token,
+    is_visit_end,
+    is_visit_start,
+)
+from cehrgpt.models.hf_cehrgpt import CEHRGPT2LMHeadModel
+from cehrgpt.models.tokenization_hf_cehrgpt import CehrGptTokenizer
+@dataclass
+class TimeToEvent:
+    """Summary of simulated (outcome, time) pairs, as assembled by `create_time_to_event`."""
+    # Mean of `time_intervals` (computed with `np.mean`).
+    average_time: float
+    # Median of `time_intervals` (computed with `np.median`).
+    median_time: float
+    # Standard deviation of `time_intervals` (computed with `np.std`).
+    standard_deviation: float
+    # The most frequent `time_month_token` bucket among the simulated intervals.
+    most_likely_time: str
+    # Number of simulations the summary is reported for (supplied by the caller).
+    num_of_simulations: int
+    # Time of each simulated outcome; presumably in days, since the caller accumulates
+    # `extract_time_interval_in_days` values -- confirm.
+    time_intervals: List[int]
+    # Outcome token of each simulation; predict_time_to_events uses "0" when no outcome token
+    # was reached.
+    outcome_events: List[str]
+    # Rows of {"time_interval": bucket, "probability": share}, sorted by descending probability.
+    time_interval_probability_table: List[Dict[str, Any]]
+def create_time_to_event(
+    time_event_tuples: List[Tuple[str, int]], num_of_simulations: int
+) -> TimeToEvent:
+    outcome_events, time_intervals = zip(*time_event_tuples)
+    time_buckets = [time_month_token(_) for _ in time_intervals]
+    time_bucket_counter = Counter(time_buckets)
+    most_common_item = time_bucket_counter.most_common(1)[0][0]
+    total_count = sum(time_bucket_counter.values())
+    # Generate the probability table
+    probability_table = {
+        item: count / total_count for item, count in time_bucket_counter.items()
+    }
+    sorted_probability_table = [
+        {"time_interval": k, "probability": v}
+        for k, v in sorted(probability_table.items(), key=lambda x: x[1], reverse=True)
+    ]
+    return TimeToEvent(
+        time_intervals=time_intervals,
+        outcome_events=outcome_events,
+        average_time=np.mean(time_intervals),
+        median_time=np.median(time_intervals),
+        standard_deviation=np.std(time_intervals),
+        most_likely_time=most_common_item,
+        num_of_simulations=num_of_simulations,
+        time_interval_probability_table=sorted_probability_table,
+    )
+class TimeToEventModel:
+    def __init__(
+        self,
+        tokenizer: CehrGptTokenizer,
+        model: CEHRGPT2LMHeadModel,
+        outcome_events: List[str],
+        generation_config: GenerationConfig,
+        device: torch.device = torch.device("cpu"),
+        batch_size: int = 32,
+    ):
+        self.tokenizer = tokenizer
+        self.model = model.eval()
+        self.generation_config = generation_config
+        self.outcome_events = outcome_events
+        self.device = device
+        self.batch_size = batch_size
+        self.max_sequence = model.config.n_positions
+    def is_outcome_event(self, token: str):
+        return token in self.outcome_events
+    def simulate(
+        self, partial_history: Union[np.ndarray, List[str]]
+    ) -> List[List[str]]:
+        sequence_is_demographics = len(partial_history) == 4 and partial_history[
+            0
+        ].startswith("year")
+        sequence_ends_ve = is_visit_end(partial_history[-1])
+        if not (sequence_is_demographics | sequence_ends_ve):
+            raise ValueError(
+                "There are only two types of sequences allowed. 1) the sequence only contains "
+                "demographics; 2) the sequence ends on VE;"
+            )
+        token_ids = self.tokenizer.encode(partial_history)
+        prompt = torch.tensor(token_ids).unsqueeze(0).to(self.device)
+        simulated_sequences = []
+        num_iters = max(
+            math.ceil(self.generation_config.num_return_sequences / self.batch_size), 1
+        )
+        old_num_return_sequences = self.generation_config.num_return_sequences
+        self.generation_config.num_return_sequences = min(
+            self.batch_size, old_num_return_sequences
+        )
+        with torch.no_grad():
+            for _ in range(num_iters):
+                results = self.model.generate(
+                    inputs=prompt,
+                    generation_config=self.generation_config,
+                )
+                # Clear the cache
+                torch.cuda.empty_cache()
+                # Add the sequences to the result array
+                simulated_sequences.extend(
+                    [
+                        self.tokenizer.decode(seq.cpu().numpy())
+                        for seq in results.sequences
+                    ]
+                )
+        self.generation_config.num_return_sequences = old_num_return_sequences
+        return simulated_sequences
+    def predict_time_to_events(
+        self,
+        partial_history: Union[np.ndarray, list],
+        future_visit_start: int = 0,
+        future_visit_end: int = -1,
+        prediction_window_start: int = 0,
+        prediction_window_end: int = 365,
+        debug: bool = False,
+        max_n_trial: int = 2,
+    ) -> Optional[TimeToEvent]:
+        patient_history_length = len(partial_history)
+        time_event_tuples = []
+        seqs_failed_to_convert = []
+        n_trial = 0
+        num_return_sequences = self.generation_config.num_return_sequences
+        max_new_tokens = self.generation_config.max_new_tokens
+        while (
+            len(time_event_tuples) < self.generation_config.num_return_sequences
+            and n_trial < max_n_trial
+        ):
+            self.generation_config.num_return_sequences = num_return_sequences - len(
+                time_event_tuples
+            )
+            # self.generation_config.max_new_tokens = max_new_tokens * (n_trial + 1)
+            simulated_seqs = self.simulate(partial_history)
+            n_trial += 1
+            for seq in simulated_seqs:
+                visit_counter = 0
+                time_delta = 0
+                success = False
+                for next_token in seq[patient_history_length:]:
+                    visit_counter += int(is_visit_start(next_token))
+                    if (
+                        visit_counter > future_visit_end != -1
+                        or time_delta > prediction_window_end != -1
+                    ):
+                        time_event_tuples.append(("0", time_delta))
+                        success = True
+                        break
+                    if is_att_token(next_token):
+                        time_delta += extract_time_interval_in_days(next_token)
+                    elif (
+                        visit_counter >= future_visit_start
+                        and time_delta >= prediction_window_start
+                    ) and self.is_outcome_event(next_token):
+                        time_event_tuples.append((next_token, time_delta))
+                        success = True
+                        break
+                if not success:
+                    # This indicates the generated sequence did not satisfy the criteria
+                    if future_visit_end != -1 or prediction_window_end != -1:
+                        seqs_failed_to_convert.append(seq[patient_history_length:])
+                    else:
+                        time_event_tuples.append(("0", time_delta))
+        self.generation_config.num_return_sequences = num_return_sequences
+        self.generation_config.max_new_tokens = max_new_tokens
+        if debug:
+            print(f"seqs_failed_to_convert: {seqs_failed_to_convert}")
+        # Count the occurrences of each time tokens for each concept
+        return (
+            create_time_to_event(time_event_tuples, len(time_event_tuples))
+            if len(time_event_tuples) > 0
+            else None
+        )
+    @staticmethod
+    def get_generation_config(
+        tokenizer: CehrGptTokenizer,
+        max_length: int,
+        num_return_sequences: int,
+        top_p: float = 1.0,
+        top_k: int = 300,
+        temperature: float = 1.0,
+        repetition_penalty: float = 1.0,
+        epsilon_cutoff: float = 0.0,
+        max_new_tokens: int = 128,
+    ) -> GenerationConfig:
+        return GenerationConfig(
+            max_length=max_length,
+            max_new_tokens=max_new_tokens,
+            num_return_sequences=num_return_sequences,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            epsilon_cutoff=epsilon_cutoff,
+            top_p=top_p,
+            top_k=top_k,
+            bos_token_id=tokenizer.end_token_id,
+            eos_token_id=tokenizer.end_token_id,
+            pad_token_id=tokenizer.pad_token_id,
+            do_sample=True,
+            use_cache=True,
+            return_dict_in_generate=True,
+            output_attentions=False,
+            output_hidden_states=False,
+            output_scores=False,
+            renormalize_logits=True,
+        )