cehrgpt 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cehrgpt/analysis/irregularity.py +36 -0
- cehrgpt/data/hf_cehrgpt_dataset.py +1 -0
- cehrgpt/data/hf_cehrgpt_dataset_collator.py +454 -68
- cehrgpt/data/hf_cehrgpt_dataset_mapping.py +232 -17
- cehrgpt/data/sample_packing_sampler.py +36 -6
- cehrgpt/generation/cehrgpt_conditional_generation.py +314 -0
- cehrgpt/generation/generate_batch_hf_gpt_sequence.py +15 -3
- cehrgpt/generation/omop_converter_batch.py +32 -2
- cehrgpt/gpt_utils.py +20 -2
- cehrgpt/models/config.py +25 -0
- cehrgpt/models/hf_cehrgpt.py +244 -39
- cehrgpt/models/hf_modeling_outputs.py +1 -0
- cehrgpt/models/special_tokens.py +1 -0
- cehrgpt/models/tokenization_hf_cehrgpt.py +354 -71
- cehrgpt/runners/data_utils.py +131 -5
- cehrgpt/runners/hf_cehrgpt_finetune_runner.py +84 -51
- cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +59 -7
- cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +60 -0
- cehrgpt/runners/hyperparameter_search_util.py +6 -7
- cehrgpt/runners/sample_packing_trainer.py +17 -0
- cehrgpt/time_to_event/config/1_year_cabg.yaml +23 -0
- cehrgpt/time_to_event/time_to_event_model.py +2 -13
- cehrgpt/time_to_event/time_to_event_prediction.py +27 -13
- cehrgpt/tools/linear_prob/compute_cehrgpt_features.py +80 -62
- {cehrgpt-0.1.0.dist-info → cehrgpt-0.1.2.dist-info}/METADATA +102 -7
- {cehrgpt-0.1.0.dist-info → cehrgpt-0.1.2.dist-info}/RECORD +29 -26
- {cehrgpt-0.1.0.dist-info → cehrgpt-0.1.2.dist-info}/WHEEL +1 -1
- {cehrgpt-0.1.0.dist-info → cehrgpt-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {cehrgpt-0.1.0.dist-info → cehrgpt-0.1.2.dist-info}/top_level.txt +0 -0
cehrgpt/models/hf_cehrgpt.py
CHANGED
@@ -6,7 +6,7 @@ import numpy as np
 import torch
 import torch.nn.functional as f
 from torch import nn
-from torch.distributions import
+from torch.distributions import Exponential, Gamma
 from torch.nn import CrossEntropyLoss
 from torch.nn import functional as F
 from transformers import PreTrainedModel
@@ -102,7 +102,9 @@ def is_sample_pack(attention_mask: torch.Tensor) -> bool:
     attention_mask = attention_mask.flip(dims=[1])

     nonzero_counts = attention_mask.sum(dim=1)
-    max_token_positions = torch.argmax(
+    max_token_positions = torch.argmax(
+        attention_mask.to(torch.int32).flip(dims=[1]), dim=1
+    )
     max_indices = attention_mask.shape[1] - 1 - max_token_positions
     return torch.any(nonzero_counts < (max_indices + 1)).item()

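The change above casts the mask to int32 before torch.argmax, presumably because argmax rejects bool tensors in some PyTorch builds. The sketch below is not from the package (the toy mask and names are invented); it only illustrates the idea behind the check, namely that a row counts as sample-packed when padding appears before its last real token:

    import torch

    # Row 0 is plain right-padding; row 1 has a gap between two packed samples.
    attention_mask = torch.tensor(
        [[1, 1, 1, 0, 0],
         [1, 1, 0, 1, 0]]
    )

    nonzero_counts = attention_mask.sum(dim=1)
    # Flip, cast to int32 (argmax does not accept bool), and locate the last non-zero entry.
    flipped = attention_mask.to(torch.int32).flip(dims=[1])
    last_token_positions = attention_mask.shape[1] - 1 - torch.argmax(flipped, dim=1)

    # Packing is detected when a row has fewer tokens than its last token position implies.
    print(torch.any(nonzero_counts < (last_token_positions + 1)).item())  # True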
@@ -362,9 +364,37 @@ class GPT2FlashAttention(GPT2Attention):
         )


-class
+class MotorTaskHead(nn.Module):
+    def __init__(self, input_dim, motor_tte_vocab_size, motor_num_time_pieces):
+        super(MotorTaskHead, self).__init__()
+        self.input_dim = input_dim
+        self.motor_tte_vocab_size = motor_tte_vocab_size
+        self.motor_num_time_pieces = motor_num_time_pieces
+        self.linear = nn.Sequential(
+            nn.Linear(input_dim, input_dim // 2),
+            gelu_new,
+            nn.Linear(
+                input_dim // 2, motor_tte_vocab_size * self.motor_num_time_pieces
+            ),
+        )
+
+    def forward(self, x):
+        # Ensure scale is positive
+        length = x.shape[0]
+        # (num_visits_in_batch, motor_tte_vocab_size * motor_num_time_pieces)
+        lambda_p = f.softplus(self.linear(x))
+        # Check for NaN values
+        if torch.isnan(lambda_p).any():
+            logger.warning(f"NaN values found in scale_param. x: {x}")
+        # (num_visits_in_batch, motor_num_time_pieces, motor_tte_vocab_size,)
+        return lambda_p.view(
+            length, self.motor_num_time_pieces, self.motor_tte_vocab_size
+        )
+
+
+class VisitTimeToEventHead(nn.Module):
     def __init__(self, input_dim):
-        super(
+        super(VisitTimeToEventHead, self).__init__()
         self.linear1 = nn.Sequential(
             nn.Linear(input_dim, input_dim // 2), gelu_new, nn.Linear(input_dim // 2, 1)
         )
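MotorTaskHead maps each visit-end ([VE]) hidden state through a small MLP and a softplus to a non-negative rate for every (time piece, TTE concept) pair, i.e. the rate parameters of a piecewise-exponential time-to-event model. A shape-only sketch with made-up sizes (the real dimensions come from the model config, and the package uses gelu_new rather than nn.GELU):

    import torch
    from torch import nn

    num_visits, hidden_dim = 7, 768           # hypothetical
    num_time_pieces, tte_vocab_size = 8, 512  # hypothetical

    ve_token_features = torch.randn(num_visits, hidden_dim)

    # Stand-in for the head's nn.Sequential: hidden_dim -> hidden_dim // 2 -> pieces * vocab.
    mlp = nn.Sequential(
        nn.Linear(hidden_dim, hidden_dim // 2),
        nn.GELU(),
        nn.Linear(hidden_dim // 2, num_time_pieces * tte_vocab_size),
    )
    rates = nn.functional.softplus(mlp(ve_token_features)).view(
        num_visits, num_time_pieces, tte_vocab_size
    )

    assert rates.shape == (num_visits, num_time_pieces, tte_vocab_size)
    assert (rates >= 0).all()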
@@ -661,32 +691,33 @@ class CEHRGPTPreTrainedModel(PreTrainedModel):
             hasattr(self, "hf_quantizer") and self.hf_quantizer is not None
         )
         wpe = self.get_position_embeddings()
-
-
-
-
-
-
-
-
-
-        # initialize all new embeddings (in particular added tokens)
-        self._init_weights(new_embeddings)
-        if is_deepspeed_zero3_enabled() and not is_quantized:
-            import deepspeed
+        if wpe is not None:
+            max_position, embed_dim = wpe.weight.shape
+            if new_num_position_embeddings > max_position:
+                new_embeddings = nn.Embedding(
+                    new_num_position_embeddings,
+                    embed_dim,
+                    device=wpe.weight.device,
+                    dtype=wpe.weight.dtype,
+                )

-
-
+                # initialize all new embeddings (in particular added tokens)
+                self._init_weights(new_embeddings)
+                if is_deepspeed_zero3_enabled() and not is_quantized:
+                    import deepspeed
+
+                    params = [wpe.weight, new_embeddings.weight]
+                    with deepspeed.zero.GatheredParameters(params, modifier_rank=0):
+                        new_embeddings.weight.data[:max_position, :] = (
+                            wpe.weight.data[:max_position, :]
+                        )
+                else:
                     new_embeddings.weight.data[:max_position, :] = wpe.weight.data[
                         :max_position, :
                     ]
-
-
-
-            ]
-        self.set_position_embeddings(new_embeddings)
-        self.config.max_position_embeddings = new_num_position_embeddings
-        self.update_attn_bias(new_num_position_embeddings)
+            self.set_position_embeddings(new_embeddings)
+            self.config.max_position_embeddings = new_num_position_embeddings
+            self.update_attn_bias(new_num_position_embeddings)


 class CEHRGPT2Model(CEHRGPTPreTrainedModel):
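The rewritten resize path only acts when a position-embedding table exists and the requested size is larger; it copies the learned rows into a fresh nn.Embedding (gathering parameters first under DeepSpeed ZeRO-3) and then updates the config and attention bias. A minimal sketch of the non-DeepSpeed branch, written outside the class with hypothetical sizes:

    import torch
    from torch import nn

    wpe = nn.Embedding(512, 64)              # existing table (hypothetical sizes)
    new_num_position_embeddings = 1024

    if new_num_position_embeddings > wpe.num_embeddings:
        new_wpe = nn.Embedding(
            new_num_position_embeddings,
            wpe.embedding_dim,
            device=wpe.weight.device,
            dtype=wpe.weight.dtype,
        )
        with torch.no_grad():
            # keep the already-trained rows; the extra rows keep their fresh initialization
            new_wpe.weight.data[: wpe.num_embeddings, :] = wpe.weight.data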
@@ -740,6 +771,10 @@ class CEHRGPT2Model(CEHRGPTPreTrainedModel):
             )
             self.update_attn_bias(self.config.sample_packing_max_positions)

+    def enable_position_embeddings(self):
+        self.wpe = nn.Embedding(self.config.max_position_embeddings, self.embed_dim)
+        self.config.exclude_position_ids = False
+
     def initialize_pretrained_embeddings(self):
         layers = [
             nn.Embedding(self.config.vocab_size, self.config.pretrained_embedding_dim),
@@ -1043,7 +1078,7 @@ class CEHRGPT2Model(CEHRGPTPreTrainedModel):
             )

         if not self.exclude_position_ids:
-            position_embeds = self.wpe(position_ids)
+            position_embeds = self.wpe(position_ids).to(input_embeddings.dtype)
             hidden_states = input_embeddings + position_embeds
         else:
             hidden_states = input_embeddings
@@ -1152,7 +1187,7 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
         super().__init__(config)
         self.cehrgpt = CEHRGPT2Model(config)
         if self.config.include_ttv_prediction:
-            self.tte_head =
+            self.tte_head = VisitTimeToEventHead(config.n_embd)

         if self.config.use_sub_time_tokenization:
             self.time_token_lm_head = nn.Linear(
@@ -1165,6 +1200,11 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
                 config.n_embd, config.value_vocab_size, bias=False
             )

+        if self.config.include_motor_time_to_event:
+            self.motor_tte = MotorTaskHead(
+                config.n_embd, config.motor_tte_vocab_size, config.motor_num_time_pieces
+            )
+
         # Model parallel
         self.model_parallel = False
         self.device_map = None
@@ -1192,6 +1232,8 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
             self.value_head = self.value_head.to(self.cehrgpt.first_device)
         if self.config.include_ttv_prediction:
             self.tte_head = self.tte_head.to(self.cehrgpt.first_device)
+        if self.config.include_motor_time_to_event:
+            self.motor_tte = self.motor_tte.to(self.cehrgpt.first_device)
         self.model_parallel = True

     def deparallelize(self):
@@ -1206,6 +1248,8 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
         self.value_head = self.value_head.to("cpu")
         if self.config.include_ttv_prediction:
             self.tte_head = self.tte_head.to("cpu")
+        if self.config.include_motor_time_to_event:
+            self.motor_tte = self.motor_tte.to("cpu")
         self.model_parallel = False
         torch.cuda.empty_cache()

@@ -1233,6 +1277,28 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
     def update_attn_bias(self, max_position_embeddings: int):
         self.cehrgpt.update_attn_bias(max_position_embeddings)

+    def update_motor_tte_vocab_size(
+        self, motor_tte_vocab_size: Optional[int] = None
+    ) -> None:
+        update_motor_tte_layer = False
+        if motor_tte_vocab_size and motor_tte_vocab_size > 0:
+            if self.config.include_motor_time_to_event:
+                if self.config.motor_tte_vocab_size != motor_tte_vocab_size:
+                    self.config.include_motor_time_to_event = True
+                    self.config.motor_tte_vocab_size = motor_tte_vocab_size
+                    update_motor_tte_layer = True
+            else:
+                self.config.include_motor_time_to_event = True
+                self.config.motor_tte_vocab_size = motor_tte_vocab_size
+                update_motor_tte_layer = True
+
+        if update_motor_tte_layer:
+            self.motor_tte = MotorTaskHead(
+                self.config.n_embd,
+                self.config.motor_tte_vocab_size,
+                self.config.motor_num_time_pieces,
+            )
+
     def prepare_inputs_for_generation(
         self,
         input_ids,
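update_motor_tte_vocab_size is presumably intended for fine-tuning a checkpoint whose config either lacks the MOTOR time-to-event head or was built with a different TTE vocabulary: it updates the config flags and, only when something actually changed, rebuilds motor_tte. A hypothetical call site (the variable names are illustrative, not taken from the runners):

    # model: a CEHRGPT2LMHeadModel restored via from_pretrained(...)
    # motor_tte_concepts: hypothetical list of concepts to predict time-to-event for
    model.update_motor_tte_vocab_size(motor_tte_vocab_size=len(motor_tte_concepts))
    assert model.config.include_motor_time_to_event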
@@ -1328,6 +1394,74 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):

         return model_inputs

+    def motor_nll_loss(
+        self,
+        ve_token_features,
+        motor_time_to_event_vectors,
+        motor_event_indicators,
+        motor_time_to_event_to_include,
+        motor_time_indicators,
+        batch_motor_end_index,
+    ):
+        """
+        Computes the negative log-likelihood (NLL) loss using the LogNormal distribution.
+
+        for modeling time-to-event data at each visit.
+
+        Args:
+            ve_token_features (Tensor): Hidden representations for the [VE] tokens [num_visits, hidden_dim].
+            motor_time_to_event_vectors (Tensor): Raw time-to-event durations [B, T, motor_vocab_size] (flattened).
+            motor_time_to_event_to_include: (Tensor): Bool indicators (True if included, False if not included).
+            motor_event_indicators (Tensor): Binary indicators (1 if censored, 0 if event occurred).
+            motor_time_indicators (Tensor): Binary indicators whether the time occurs in the current
+                time bucket (1 if censored, 0 if event occurred).
+            batch_motor_end_index (Tensor): Tensor indicating the number of valid [VE] tokens in the batch.
+
+        Returns:
+            Tensor: Scalar loss value (mean negative log-likelihood).
+        """
+        batch_motor_end_index = batch_motor_end_index.sum().item()
+        motor_time_to_event_vectors = motor_time_to_event_vectors.view(
+            (-1, self.config.motor_num_time_pieces, self.config.motor_tte_vocab_size)
+        )[:batch_motor_end_index].clamp(min=1e-3)
+        motor_event_indicators = motor_event_indicators.reshape(
+            (-1, self.config.motor_num_time_pieces, self.config.motor_tte_vocab_size)
+        )[:batch_motor_end_index]
+        motor_time_to_event_to_include = motor_time_to_event_to_include.flatten()[
+            :batch_motor_end_index
+        ]
+        motor_time_indicators = motor_time_indicators.view(
+            (-1, self.config.motor_num_time_pieces, self.config.motor_tte_vocab_size)
+        )[:batch_motor_end_index]
+        assert ve_token_features.shape[0] == motor_time_to_event_vectors.shape[0], (
+            "The number of VE tokens in the labels needs to match up "
+            "with the first dimension of motor_time_to_event_vectors. "
+            f"Received ve_token_features.shape[0]: {ve_token_features.shape[0]}, "
+            f"motor_time_to_event_vectors.shape[0]: {motor_time_to_event_vectors.shape[0]}"
+        )
+        motor_time_to_event_vectors = motor_time_to_event_vectors[
+            motor_time_to_event_to_include
+        ]
+        motor_event_indicators = motor_event_indicators[motor_time_to_event_to_include]
+        motor_time_indicators = motor_time_indicators[motor_time_to_event_to_include]
+        ve_token_features = ve_token_features[motor_time_to_event_to_include]
+
+        # Get Exponential parameters from model
+        lambda_p = self.motor_tte(ve_token_features)
+        # (num_visits_in_batch, num_of_pieces, motor_vocab_size)
+        dist = Exponential(lambda_p.clamp(min=1e-3))
+
+        # Compute event loss
+        tte_loss = torch.where(
+            motor_event_indicators,
+            -dist.log_prob(motor_time_to_event_vectors),
+            -torch.log(
+                1 - dist.cdf(motor_time_to_event_vectors).clamp(max=1 - 1e-6) + 1e-6
+            ),
+        )
+        tte_loss = torch.where(motor_time_indicators, tte_loss, 0.0)
+        return torch.mean(tte_loss)
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
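Although the docstring mentions a LogNormal distribution, the body feeds the MotorTaskHead rates into torch.distributions.Exponential: one branch of the torch.where contributes the negative log-density, the other contributes the negative log of the survival function 1 - CDF(t), and entries outside the relevant time pieces are zeroed out. A toy, standalone illustration of that censored-NLL split (shapes and values are invented; the real tensors come from the data collator):

    import torch
    from torch.distributions import Exponential

    rates = torch.tensor([[0.5, 2.0]])               # (num_visits, tte_vocab): head output
    times = torch.tensor([[1.0, 0.3]])               # durations, clamped to be positive
    event_indicator = torch.tensor([[True, False]])  # which branch each entry uses

    dist = Exponential(rates.clamp(min=1e-3))
    nll = torch.where(
        event_indicator,
        -dist.log_prob(times),                                       # log-density term
        -torch.log(1 - dist.cdf(times).clamp(max=1 - 1e-6) + 1e-6),  # log-survival term
    )
    print(nll.mean())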
@@ -1344,6 +1478,11 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
         time_to_visits: Optional[torch.FloatTensor] = None,
         time_token_indicators: Optional[torch.BoolTensor] = None,
         sub_time_tokens: Optional[torch.LongTensor] = None,
+        motor_time_to_event_vectors: Optional[torch.FloatTensor] = None,
+        motor_event_indicators: Optional[torch.BoolTensor] = None,
+        motor_time_to_event_to_include: Optional[torch.BoolTensor] = None,
+        motor_time_indicators: Optional[torch.BoolTensor] = None,
+        motor_end_index: Optional[torch.LongTensor] = None,
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
@@ -1403,6 +1542,8 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
         time_token_loss = None
         time_to_visit_loss = None
         token_value_loss = None
+        motor_tte_loss = None
+
         if labels is not None:
             # move labels to correct device to enable model parallelism
             labels = labels.to(lm_logits.device)
@@ -1470,9 +1611,35 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
                 entropy_penalty = entropy.sum() / total_num_tokens
                 loss += entropy_penalty * self.cehrgpt.config.entropy_penalty_alpha

+            if (
+                self.config.include_motor_time_to_event
+                and motor_time_to_event_vectors is not None
+                and motor_event_indicators is not None
+                and motor_time_to_event_to_include is not None
+                and motor_time_indicators is not None
+                and motor_end_index is not None
+            ):
+                ve_token_id_indices = labels == self.config.ve_token_id
+                ve_token_features = hidden_states[ve_token_id_indices]
+                # Get rid of the last VE features because it's already reached the end of the patient sequence and
+                # there is nothing to predict.
+                motor_tte_loss = self.motor_nll_loss(
+                    ve_token_features=ve_token_features,
+                    motor_time_to_event_vectors=motor_time_to_event_vectors,
+                    motor_event_indicators=motor_event_indicators,
+                    motor_time_to_event_to_include=motor_time_to_event_to_include,
+                    motor_time_indicators=motor_time_indicators,
+                    batch_motor_end_index=motor_end_index,
+                )
+                loss += motor_tte_loss * self.config.motor_time_to_event_weight
+
             # We add another loss term when use_sub_time_tokenization is enabled, we need to recover the sub time token
             # predictions for year/month/token
-            if
+            if (
+                self.config.use_sub_time_tokenization
+                and sub_time_tokens is not None
+                and time_token_indicators is not None
+            ):
                 # Split the last dimensions into three parts
                 time_loss_fct = CrossEntropyLoss(reduction="none")
                 time_token_logits = self.time_token_lm_head(
@@ -1501,7 +1668,7 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
                 time_token_loss = time_token_loss.sum() / total_num_tokens
                 loss += time_token_loss * self.config.time_token_loss_weight

-            if time_to_visits is not None:
+            if time_to_visits is not None and time_to_visits is not None:
                 # Get lambda and k parameters
                 lambda_param, k_param = self.tte_head(hidden_states)

@@ -1512,14 +1679,15 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):

                 # Move to the same device as lambda_param
                 shift_time_to_visits = shift_time_to_visits.to(lambda_param.device)
-
                 time_to_visit_indicator = shift_time_to_visits >= 0
                 # Define the Gamma distribution
                 dist = Gamma(
                     shifted_k_param.squeeze(-1), shifted_lambda_param.squeeze(-1)
                 )
                 # Compute log-probs and apply the time_to_visit_indicator
-                log_probs = dist.log_prob(
+                log_probs = dist.log_prob(
+                    torch.clamp(shift_time_to_visits, min=1e-3) + 1e-6
+                )
                 log_probs = torch.where(time_to_visit_indicator, log_probs, 0)
                 time_to_visit_loss = -log_probs.sum() / total_num_tokens
                 # Compute the loss
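The added clamp around shift_time_to_visits presumably protects the Gamma log-density from zero or negative durations, which yield -inf log-probabilities and, in turn, NaN gradients. A quick standalone check of that boundary behaviour:

    import torch
    from torch.distributions import Gamma

    # validate_args=False so the boundary value is evaluated instead of rejected
    dist = Gamma(torch.tensor(2.0), torch.tensor(1.0), validate_args=False)
    zero_duration = torch.tensor(0.0)

    print(dist.log_prob(zero_duration))                                 # -inf
    print(dist.log_prob(torch.clamp(zero_duration, min=1e-3) + 1e-6))   # finite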
@@ -1564,6 +1732,7 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
             time_token_loss=time_token_loss,
             time_to_visit_loss=time_to_visit_loss,
             token_value_loss=token_value_loss,
+            motor_tte_loss=motor_tte_loss,
         )

     @staticmethod
@@ -1681,6 +1850,7 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):

         # keep track of which sequences are already finished
         batch_size, cur_len = input_ids.shape
+        model_kwargs["attention_mask"] = input_ids != pad_token_id
         if "inputs_embeds" in model_kwargs:
             cur_len = model_kwargs["inputs_embeds"].shape[1]
         this_peer_finished = False
@@ -1699,11 +1869,19 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
             [] if self.config.lab_token_ids is None else self.config.lab_token_ids,
             dtype=torch.int32,
         )
-
-
-
-
-
+
+        if model_kwargs.get("value_indicators", None) is not None:
+            value_indicators = model_kwargs.get("value_indicators")
+        else:
+            value_indicators = torch.zeros_like(input_ids).to(torch.bool)
+
+        if model_kwargs.get("values", None) is not None:
+            values = model_kwargs.get("values")
+        else:
+            values = torch.zeros_like(
+                input_ids,
+                dtype=torch.int32,
+            )
         # Generate initial random_vectors
         if self.cehrgpt.config.causal_sfm:
             model_kwargs["random_vectors"] = torch.rand(
@@ -1837,6 +2015,27 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
         )


+class FocalLoss(nn.Module):
+    def __init__(self, alpha=0.25, gamma=2.0, reduction="mean"):
+        super().__init__()
+        self.alpha = alpha
+        self.gamma = gamma
+        self.reduction = reduction
+
+    def forward(self, logits, targets):
+        bce_loss = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
+        probs = torch.sigmoid(logits)
+        pt = torch.where(targets == 1, probs, 1 - probs)
+        focal_term = (1 - pt) ** self.gamma
+        loss = self.alpha * focal_term * bce_loss
+
+        if self.reduction == "mean":
+            return loss.mean()
+        elif self.reduction == "sum":
+            return loss.sum()
+        return loss
+
+
 class CehrGptForClassification(CEHRGPTPreTrainedModel):
     _keep_in_fp32_modules = ["age_batch_norm", "dense_layer", "classifier"]

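FocalLoss here is the standard binary focal loss on logits (alpha-weighted BCE scaled by (1 - pt) ** gamma), added at module level in hf_cehrgpt.py. A small usage sketch, assuming the class is importable from cehrgpt.models.hf_cehrgpt as shown above (inputs are made up):

    import torch
    from cehrgpt.models.hf_cehrgpt import FocalLoss  # assumes the class ships as shown above

    focal = FocalLoss(alpha=0.25, gamma=2.0, reduction="mean")

    logits = torch.tensor([2.0, -1.0, 0.5])   # raw scores
    targets = torch.tensor([1.0, 0.0, 1.0])   # binary labels as floats

    loss = focal(logits, targets)
    # Confidently classified examples (pt close to 1) are down-weighted by (1 - pt) ** gamma,
    # so the least confident example contributes most to the mean.
    print(loss)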
@@ -1859,7 +2058,6 @@ class CehrGptForClassification(CEHRGPTPreTrainedModel):
         self.model_parallel = False
         self.device_map = None
         self.gradient_checkpointing = False
-
         # Initialize weights and apply final processing
         self.post_init()

@@ -1971,7 +2169,14 @@ class CehrGptForClassification(CEHRGPTPreTrainedModel):

         loss = None
         if classifier_label is not None:
-
+            if self.config.class_weights:
+                class_weights = torch.tensor(
+                    [self.config.class_weights[1] / self.config.class_weights[0]],
+                    dtype=torch.float32,
+                ).to(logits.device)
+            else:
+                class_weights = None
+            loss_fct = nn.BCEWithLogitsLoss(pos_weight=class_weights)
             loss = loss_fct(logits, classifier_label)

         return CehrGptSequenceClassifierOutput(
cehrgpt/models/hf_modeling_outputs.py
CHANGED
@@ -85,6 +85,7 @@ class CehrGptCausalLMOutput(ModelOutput):
     time_token_loss: Optional[torch.FloatTensor] = None
     time_to_visit_loss: Optional[torch.FloatTensor] = None
     token_value_loss: Optional[torch.FloatTensor] = None
+    motor_tte_loss: Optional[torch.FloatTensor] = None


 @dataclass