cehrgpt 0.0.2__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. cehrgpt/analysis/irregularity.py +36 -0
  2. cehrgpt/data/hf_cehrgpt_dataset.py +25 -4
  3. cehrgpt/data/hf_cehrgpt_dataset_collator.py +635 -97
  4. cehrgpt/data/hf_cehrgpt_dataset_mapping.py +308 -95
  5. cehrgpt/data/sample_packing_sampler.py +181 -0
  6. cehrgpt/generation/generate_batch_hf_gpt_sequence.py +12 -9
  7. cehrgpt/generation/omop_converter_batch.py +32 -2
  8. cehrgpt/gpt_utils.py +20 -2
  9. cehrgpt/models/config.py +35 -0
  10. cehrgpt/models/hf_cehrgpt.py +470 -106
  11. cehrgpt/models/hf_modeling_outputs.py +1 -0
  12. cehrgpt/models/special_tokens.py +1 -0
  13. cehrgpt/models/tokenization_hf_cehrgpt.py +358 -71
  14. cehrgpt/runners/data_utils.py +358 -0
  15. cehrgpt/runners/gpt_runner_util.py +0 -10
  16. cehrgpt/runners/hf_cehrgpt_finetune_runner.py +181 -283
  17. cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +288 -112
  18. cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +90 -0
  19. cehrgpt/runners/hyperparameter_search_util.py +10 -8
  20. cehrgpt/runners/sample_packing_trainer.py +185 -0
  21. cehrgpt/simulations/generate_plots.py +95 -0
  22. cehrgpt/simulations/run_simulation.sh +24 -0
  23. cehrgpt/simulations/time_embedding_simulation.py +250 -0
  24. cehrgpt/simulations/time_token_simulation.py +177 -0
  25. cehrgpt/time_to_event/config/1_year_cabg.yaml +23 -0
  26. cehrgpt/time_to_event/time_to_event_model.py +2 -13
  27. cehrgpt/time_to_event/time_to_event_prediction.py +27 -13
  28. cehrgpt/tools/linear_prob/__init__.py +0 -0
  29. cehrgpt/tools/linear_prob/compute_cehrgpt_features.py +495 -0
  30. cehrgpt/tools/linear_prob/train_with_cehrgpt_features.py +152 -0
  31. {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/METADATA +11 -8
  32. {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/RECORD +36 -32
  33. {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/WHEEL +1 -1
  34. cehrgpt/data/hf_cehrgpt_dpo_collator.py +0 -71
  35. cehrgpt/data/hf_cehrgpt_dpo_dataset_mapping.py +0 -61
  36. cehrgpt/generation/generate_paired_cehrgpt_sequence.py +0 -224
  37. cehrgpt/rl_finetune/cehrgpt_dpo_trainer.py +0 -586
  38. cehrgpt/rl_finetune/cehrgpt_ppo_trainer.py +0 -464
  39. cehrgpt/rl_finetune/ppo_finetune.py +0 -394
  40. cehrgpt/rl_finetune/ppo_finetune_v2.py +0 -373
  41. cehrgpt/runners/hf_cehrgpt_dpo_runner.py +0 -119
  42. /cehrgpt/{rl_finetune → simulations}/__init__.py +0 -0
  43. {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info/licenses}/LICENSE +0 -0
  44. {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/top_level.txt +0 -0
@@ -6,7 +6,7 @@ import numpy as np
  import torch
  import torch.nn.functional as f
  from torch import nn
- from torch.distributions import Gamma
+ from torch.distributions import Exponential, Gamma
  from torch.nn import CrossEntropyLoss
  from torch.nn import functional as F
  from transformers import PreTrainedModel
@@ -45,12 +45,108 @@ if is_accelerate_available():
  logger = logging.get_logger(__name__)


+ def extract_features_from_packed_sequence(
+ hidden_state: torch.Tensor,
+ attention_mask: torch.Tensor,
+ ) -> torch.Tensor:
+ max_index = attention_mask.nonzero(as_tuple=False).flatten()[-1]
+ padded_attention_mask = F.pad(attention_mask[:, : max_index + 1], (0, 1))
+ feature_indices = torch.nonzero(padded_attention_mask == 0)[:, 1] - 1
+ return hidden_state[:, feature_indices]
+
+
+ def create_sample_packing_attention_mask(attention_mask: torch.Tensor) -> torch.Tensor:
+ """
+ Create a block-diagonal attention mask for packed sequences within a batch.
+
+ Args:
+ attention_mask (torch.Tensor): (batch_size, seq_len) binary mask where 1 = token, 0 = padding
+
+ Returns:
+ torch.Tensor: (batch_size, seq_len, seq_len) attention mask where entries are 1 if tokens
+ can attend to each other (within same packed segment), 0 otherwise.
+ """
+ # Step 1: Identify segments within each sample
+ cumsum_mask = (attention_mask == 0).cumsum(dim=-1)
+ segment_ids = cumsum_mask * attention_mask # zeros remain zero
+
+ # Step 2: Compare segment IDs pairwise per batch element
+ # Shape: (batch_size, seq_len, seq_len)
+ attn_matrix = (segment_ids.unsqueeze(2) == segment_ids.unsqueeze(1)).int()
+
+ # Step 3: Mask out padding tokens
+ mask = attention_mask.unsqueeze(1) * attention_mask.unsqueeze(2)
+ attn_matrix = attn_matrix * mask
+
+ return attn_matrix
+
+
+ def is_sample_pack(attention_mask: torch.Tensor) -> bool:
+ """
+ Determines whether any sequence in the batch is likely sample-packed.
+
+ A sample-packed sequence is one where there are non-padding (1) tokens
+ after a padding (0) token, indicating multiple sequences packed together
+ with padding as a separator.
+
+ Args:
+ attention_mask (torch.Tensor): A tensor of shape (batch_size, seq_len)
+ where 1 indicates a real token and 0 indicates padding.
+
+ Returns:
+ bool: True if any sample in the batch is sample-packed, False otherwise.
+ """
+
+ # If the attention_mask is left padded, we will flip it so we can use the same logic below
+ if (attention_mask[:, 0] == 0).any():
+ attention_mask = attention_mask.flip(dims=[1])
+
+ nonzero_counts = attention_mask.sum(dim=1)
+ max_token_positions = torch.argmax(attention_mask.flip(dims=[1]), dim=1)
+ max_indices = attention_mask.shape[1] - 1 - max_token_positions
+ return torch.any(nonzero_counts < (max_indices + 1)).item()
+
+
  # Copied from transformers.models.llama.modeling_llama._get_unpad_data
  def _get_unpad_data(attention_mask):
- seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
- indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
- max_seqlen_in_batch = seqlens_in_batch.max().item()
- cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+ # This infers sample packing
+ if is_sample_pack(attention_mask):
+ # Assume input: attention_mask shape = (batch, seq_len)
+ attention_mask = attention_mask.flatten() # shape: (seq_len,)
+
+ # Compute max_index of the last non-zero element
+ nonzero = torch.nonzero(attention_mask, as_tuple=False).flatten()
+ max_index = nonzero[-1].item()
+
+ # Pad the truncated attention mask
+ padded_attention_mask = F.pad(attention_mask[: max_index + 1], (0, 1), value=0)
+
+ # Indices of all tokens
+ indices = torch.nonzero(attention_mask, as_tuple=False).flatten()
+
+ # Find where 0s occur (segment boundaries)
+ cumsum_seqlens_in_batch = torch.cumsum(padded_attention_mask, dim=0)[
+ padded_attention_mask == 0
+ ]
+
+ # Compute seqlens per segment
+ seqlens_in_batch = (
+ cumsum_seqlens_in_batch
+ - F.pad(cumsum_seqlens_in_batch, (1, 0), value=0)[:-1]
+ ).to(torch.int)
+
+ max_seqlen_in_batch = (
+ seqlens_in_batch.max().item() if seqlens_in_batch.numel() > 0 else 0
+ )
+ cu_seqlens = F.pad(cumsum_seqlens_in_batch, (1, 0)).to(torch.int)
+ else:
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
+ cu_seqlens = F.pad(
+ torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)
+ )
+
  return (
  indices,
  cu_seqlens,
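For orientation, a minimal usage sketch of the sample-packing helpers introduced above. It assumes the functions are importable from cehrgpt.models.hf_cehrgpt (the module shown in this diff) and uses a made-up toy mask:

import torch
from cehrgpt.models.hf_cehrgpt import (
    create_sample_packing_attention_mask,
    is_sample_pack,
)

# Toy right-padded row: two mini-sequences of length 2 packed together,
# separated and followed by padding (0s).
packed_mask = torch.tensor([[1, 1, 0, 1, 1, 0, 0]])

print(is_sample_pack(packed_mask))  # True: real tokens appear after a 0
block_mask = create_sample_packing_attention_mask(packed_mask)
# block_mask[0] is block-diagonal: positions {0, 1} attend only to each other,
# positions {3, 4} attend only to each other, and padding rows/columns stay 0.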
@@ -266,9 +362,37 @@ class GPT2FlashAttention(GPT2Attention):
  )


- class WeibullModel(nn.Module):
+ class MotorTaskHead(nn.Module):
+ def __init__(self, input_dim, motor_tte_vocab_size, motor_num_time_pieces):
+ super(MotorTaskHead, self).__init__()
+ self.input_dim = input_dim
+ self.motor_tte_vocab_size = motor_tte_vocab_size
+ self.motor_num_time_pieces = motor_num_time_pieces
+ self.linear = nn.Sequential(
+ nn.Linear(input_dim, input_dim // 2),
+ gelu_new,
+ nn.Linear(
+ input_dim // 2, motor_tte_vocab_size * self.motor_num_time_pieces
+ ),
+ )
+
+ def forward(self, x):
+ # Ensure the rate parameter is positive
+ length = x.shape[0]
+ # (num_visits_in_batch, motor_tte_vocab_size * motor_num_time_pieces)
+ lambda_p = f.softplus(self.linear(x))
+ # Check for NaN values
+ if torch.isnan(lambda_p).any():
+ logger.warning(f"NaN values found in lambda_p. x: {x}")
+ # (num_visits_in_batch, motor_num_time_pieces, motor_tte_vocab_size)
+ return lambda_p.view(
+ length, self.motor_num_time_pieces, self.motor_tte_vocab_size
+ )
+
+
+ class VisitTimeToEventHead(nn.Module):
  def __init__(self, input_dim):
- super(WeibullModel, self).__init__()
+ super(VisitTimeToEventHead, self).__init__()
  self.linear1 = nn.Sequential(
  nn.Linear(input_dim, input_dim // 2), gelu_new, nn.Linear(input_dim // 2, 1)
@@ -565,32 +689,33 @@ class CEHRGPTPreTrainedModel(PreTrainedModel):
  hasattr(self, "hf_quantizer") and self.hf_quantizer is not None
  )
  wpe = self.get_position_embeddings()
- max_position, embed_dim = wpe.weight.shape
- if new_num_position_embeddings > max_position:
- new_embeddings = nn.Embedding(
- new_num_position_embeddings,
- embed_dim,
- device=wpe.weight.device,
- dtype=wpe.weight.dtype,
- )
-
- # initialize all new embeddings (in particular added tokens)
- self._init_weights(new_embeddings)
- if is_deepspeed_zero3_enabled() and not is_quantized:
- import deepspeed
+ if wpe is not None:
+ max_position, embed_dim = wpe.weight.shape
+ if new_num_position_embeddings > max_position:
+ new_embeddings = nn.Embedding(
+ new_num_position_embeddings,
+ embed_dim,
+ device=wpe.weight.device,
+ dtype=wpe.weight.dtype,
+ )

- params = [wpe.weight, new_embeddings.weight]
- with deepspeed.zero.GatheredParameters(params, modifier_rank=0):
+ # initialize all new embeddings (in particular added tokens)
+ self._init_weights(new_embeddings)
+ if is_deepspeed_zero3_enabled() and not is_quantized:
+ import deepspeed
+
+ params = [wpe.weight, new_embeddings.weight]
+ with deepspeed.zero.GatheredParameters(params, modifier_rank=0):
+ new_embeddings.weight.data[:max_position, :] = (
+ wpe.weight.data[:max_position, :]
+ )
+ else:
  new_embeddings.weight.data[:max_position, :] = wpe.weight.data[
  :max_position, :
  ]
- else:
- new_embeddings.weight.data[:max_position, :] = wpe.weight.data[
- :max_position, :
- ]
- self.set_position_embeddings(new_embeddings)
- self.config.max_position_embeddings = new_num_position_embeddings
- self.update_attn_bias(new_num_position_embeddings)
+ self.set_position_embeddings(new_embeddings)
+ self.config.max_position_embeddings = new_num_position_embeddings
+ self.update_attn_bias(new_num_position_embeddings)


  class CEHRGPT2Model(CEHRGPTPreTrainedModel):
@@ -609,7 +734,8 @@ class CEHRGPT2Model(CEHRGPTPreTrainedModel):
  self.pretrained_wte = None

  self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
- self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
+ if not self.exclude_position_ids:
+ self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
  if self.include_values:
  self.vte = nn.Embedding(config.value_vocab_size, self.embed_dim)
  self.concept_value_transformation_layer = ConceptValueTransformationLayer(
@@ -635,6 +761,18 @@ class CEHRGPT2Model(CEHRGPTPreTrainedModel):
  # Initialize weights and apply final processing
  self.post_init()

+ # We do need to update the pre-computed attention bias matrix if sample packing requires a larger context window
+ if self.config.sample_packing_max_positions > self.config.n_positions:
+ logger.info(
+ "Updated attn_bias to %s according to sample_packing_max_positions",
+ config.sample_packing_max_positions,
+ )
+ self.update_attn_bias(self.config.sample_packing_max_positions)
+
+ def enable_position_embeddings(self):
+ self.wpe = nn.Embedding(self.config.max_position_embeddings, self.embed_dim)
+ self.config.exclude_position_ids = False
+

  def initialize_pretrained_embeddings(self):
  layers = [
@@ -677,7 +815,8 @@ class CEHRGPT2Model(CEHRGPTPreTrainedModel):
  self.wte = self.wte.to(self.first_device)
  if self.config.use_pretrained_embeddings:
  self.pretrained_wte = self.pretrained_wte.to(self.first_device)
- self.wpe = self.wpe.to(self.first_device)
+ if not self.exclude_position_ids:
+ self.wpe = self.wpe.to(self.first_device)
  if self.include_values:
  self.vte = self.vte.to(self.first_device)
  self.concept_value_transformation_layer = (
@@ -703,7 +842,8 @@ class CEHRGPT2Model(CEHRGPTPreTrainedModel):
  self.wte = self.wte.to("cpu")
  if self.config.use_pretrained_embeddings:
  self.pretrained_wte = self.pretrained_wte.to("cpu")
- self.wpe = self.wpe.to("cpu")
+ if not self.exclude_position_ids:
+ self.wpe = self.wpe.to("cpu")
  self.vte = self.vte.to("cpu")
  self.concept_value_transformation_layer = (
  self.concept_value_transformation_layer.to("cpu")
@@ -728,8 +868,12 @@ class CEHRGPT2Model(CEHRGPTPreTrainedModel):
  persistent=False,
  )

- def get_position_embeddings(self) -> Union[nn.Embedding, Tuple[nn.Embedding]]:
- return self.wpe
+ def get_position_embeddings(
+ self,
+ ) -> Optional[Union[nn.Embedding, Tuple[nn.Embedding]]]:
+ if not self.exclude_position_ids:
+ return self.wpe
+ return None

  def set_position_embeddings(self, new_embeddings: nn.Embedding):
  self.wpe = new_embeddings
@@ -758,8 +902,8 @@ class CEHRGPT2Model(CEHRGPTPreTrainedModel):
  def forward(
  self,
  input_ids: Optional[torch.LongTensor],
- value_indicators: Optional[torch.BoolTensor],
- values: Optional[torch.LongTensor],
+ value_indicators: Optional[torch.BoolTensor] = None,
+ values: Optional[torch.LongTensor] = None,
  past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
  attention_mask: Optional[torch.FloatTensor] = None,
  position_ids: Optional[torch.LongTensor] = None,
@@ -850,12 +994,19 @@ class CEHRGPT2Model(CEHRGPTPreTrainedModel):
  == "flash_attention_2"
  ):
  attention_mask = attention_mask.view(batch_size, -1)
- # We create a 3D attention mask from a 2D tensor mask.
- # Sizes are [batch_size, 1, 1, to_seq_length]
- # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
- # this attention mask is more simple than the triangular masking of causal attention
- # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
- attention_mask = attention_mask[:, None, None, :]
+
+ # If this is sample packing, we need to create the block-diagonal attention mask
+ if is_sample_pack(attention_mask):
+ attention_mask = create_sample_packing_attention_mask(
+ attention_mask
+ )[:, None, :, :]
+ else:
+ # We create a 3D attention mask from a 2D tensor mask.
+ # Sizes are [batch_size, 1, 1, to_seq_length]
+ # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+ # this attention mask is simpler than the triangular masking of causal attention
+ # used in OpenAI GPT; we just need to prepare the broadcast dimension here.
+ attention_mask = attention_mask[:, None, None, :]

  # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
  # masked positions, this operation will create a tensor which is 0.0 for
@@ -925,7 +1076,7 @@ class CEHRGPT2Model(CEHRGPTPreTrainedModel):
  )

  if not self.exclude_position_ids:
- position_embeds = self.wpe(position_ids)
+ position_embeds = self.wpe(position_ids).to(input_embeddings.dtype)
  hidden_states = input_embeddings + position_embeds
  else:
  hidden_states = input_embeddings
@@ -1034,7 +1185,7 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
  super().__init__(config)
  self.cehrgpt = CEHRGPT2Model(config)
  if self.config.include_ttv_prediction:
- self.tte_head = WeibullModel(config.n_embd)
+ self.tte_head = VisitTimeToEventHead(config.n_embd)

  if self.config.use_sub_time_tokenization:
  self.time_token_lm_head = nn.Linear(
@@ -1047,6 +1198,11 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
  config.n_embd, config.value_vocab_size, bias=False
  )

+ if self.config.include_motor_time_to_event:
+ self.motor_tte = MotorTaskHead(
+ config.n_embd, config.motor_tte_vocab_size, config.motor_num_time_pieces
+ )
+
  # Model parallel
  self.model_parallel = False
  self.device_map = None
@@ -1074,6 +1230,8 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
  self.value_head = self.value_head.to(self.cehrgpt.first_device)
  if self.config.include_ttv_prediction:
  self.tte_head = self.tte_head.to(self.cehrgpt.first_device)
+ if self.config.include_motor_time_to_event:
+ self.motor_tte = self.motor_tte.to(self.cehrgpt.first_device)
  self.model_parallel = True

  def deparallelize(self):
@@ -1088,6 +1246,8 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
  self.value_head = self.value_head.to("cpu")
  if self.config.include_ttv_prediction:
  self.tte_head = self.tte_head.to("cpu")
+ if self.config.include_motor_time_to_event:
+ self.motor_tte = self.motor_tte.to("cpu")
  self.model_parallel = False
  torch.cuda.empty_cache()

@@ -1115,6 +1275,28 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
  def update_attn_bias(self, max_position_embeddings: int):
  self.cehrgpt.update_attn_bias(max_position_embeddings)

+ def update_motor_tte_vocab_size(
+ self, motor_tte_vocab_size: Optional[int] = None
+ ) -> None:
+ update_motor_tte_layer = False
+ if motor_tte_vocab_size and motor_tte_vocab_size > 0:
+ if self.config.include_motor_time_to_event:
+ if self.config.motor_tte_vocab_size != motor_tte_vocab_size:
+ self.config.include_motor_time_to_event = True
+ self.config.motor_tte_vocab_size = motor_tte_vocab_size
+ update_motor_tte_layer = True
+ else:
+ self.config.include_motor_time_to_event = True
+ self.config.motor_tte_vocab_size = motor_tte_vocab_size
+ update_motor_tte_layer = True
+
+ if update_motor_tte_layer:
+ self.motor_tte = MotorTaskHead(
+ self.config.n_embd,
+ self.config.motor_tte_vocab_size,
+ self.config.motor_num_time_pieces,
+ )
+
  def prepare_inputs_for_generation(
  self,
  input_ids,
@@ -1210,6 +1392,74 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):

  return model_inputs

+ def motor_nll_loss(
+ self,
+ ve_token_features,
+ motor_time_to_event_vectors,
+ motor_event_indicators,
+ motor_time_to_event_to_include,
+ motor_time_indicators,
+ batch_motor_end_index,
+ ):
+ """
+ Computes the negative log-likelihood (NLL) loss under a piecewise Exponential distribution
+ for modeling time-to-event data at each visit.
+
+ Args:
+ ve_token_features (Tensor): Hidden representations for the [VE] tokens [num_visits, hidden_dim].
+ motor_time_to_event_vectors (Tensor): Raw time-to-event durations [B, T, motor_vocab_size] (flattened).
+ motor_event_indicators (Tensor): Bool indicators (True if the event occurred, False if censored).
+ motor_time_to_event_to_include (Tensor): Bool indicators (True if included, False if not included).
+ motor_time_indicators (Tensor): Bool indicators of whether the time falls within the current
+ time bucket.
+ batch_motor_end_index (Tensor): Tensor indicating the number of valid [VE] tokens in the batch.
+
+ Returns:
+ Tensor: Scalar loss value (mean negative log-likelihood).
+ """
+ batch_motor_end_index = batch_motor_end_index.sum().item()
+ motor_time_to_event_vectors = motor_time_to_event_vectors.view(
+ (-1, self.config.motor_num_time_pieces, self.config.motor_tte_vocab_size)
+ )[:batch_motor_end_index].clamp(min=1e-3)
+ motor_event_indicators = motor_event_indicators.reshape(
+ (-1, self.config.motor_num_time_pieces, self.config.motor_tte_vocab_size)
+ )[:batch_motor_end_index]
+ motor_time_to_event_to_include = motor_time_to_event_to_include.flatten()[
+ :batch_motor_end_index
+ ]
+ motor_time_indicators = motor_time_indicators.view(
+ (-1, self.config.motor_num_time_pieces, self.config.motor_tte_vocab_size)
+ )[:batch_motor_end_index]
+ assert ve_token_features.shape[0] == motor_time_to_event_vectors.shape[0], (
+ "The number of VE tokens in the labels needs to match up "
+ "with the first dimension of motor_time_to_event_vectors. "
+ f"Received ve_token_features.shape[0]: {ve_token_features.shape[0]}, "
+ f"motor_time_to_event_vectors.shape[0]: {motor_time_to_event_vectors.shape[0]}"
+ )
+ motor_time_to_event_vectors = motor_time_to_event_vectors[
+ motor_time_to_event_to_include
+ ]
+ motor_event_indicators = motor_event_indicators[motor_time_to_event_to_include]
+ motor_time_indicators = motor_time_indicators[motor_time_to_event_to_include]
+ ve_token_features = ve_token_features[motor_time_to_event_to_include]
+
+ # Get Exponential parameters from model
+ lambda_p = self.motor_tte(ve_token_features)
+ # (num_visits_in_batch, num_of_pieces, motor_vocab_size)
+ dist = Exponential(lambda_p.clamp(min=1e-3))
+
+ # Compute event loss
+ tte_loss = torch.where(
+ motor_event_indicators,
+ -dist.log_prob(motor_time_to_event_vectors),
+ -torch.log(
+ 1 - dist.cdf(motor_time_to_event_vectors).clamp(max=1 - 1e-6) + 1e-6
+ ),
+ )
+ tte_loss = torch.where(motor_time_indicators, tte_loss, 0.0)
+ return torch.mean(tte_loss)
+
  def forward(
  self,
  input_ids: Optional[torch.LongTensor] = None,
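For intuition, a self-contained sketch of the censored Exponential NLL used by motor_nll_loss: an observed event contributes -log f(t) = λt - log λ, while a censored observation contributes -log S(t) = λt. The rates, times, and indicators below are made up:

import torch
from torch.distributions import Exponential

rates = torch.tensor([0.5, 0.2])              # hypothetical hazard rates
times = torch.tensor([2.0, 3.0])              # hypothetical durations
event_observed = torch.tensor([True, False])

dist = Exponential(rates)
nll = torch.where(
    event_observed,
    -dist.log_prob(times),            # event:    rate * t - log(rate) ≈ 1.693
    -torch.log(1 - dist.cdf(times)),  # censored: rate * t = 0.6
)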
@@ -1226,6 +1476,11 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
  time_to_visits: Optional[torch.FloatTensor] = None,
  time_token_indicators: Optional[torch.BoolTensor] = None,
  sub_time_tokens: Optional[torch.LongTensor] = None,
+ motor_time_to_event_vectors: Optional[torch.FloatTensor] = None,
+ motor_event_indicators: Optional[torch.BoolTensor] = None,
+ motor_time_to_event_to_include: Optional[torch.BoolTensor] = None,
+ motor_time_indicators: Optional[torch.BoolTensor] = None,
+ motor_end_index: Optional[torch.LongTensor] = None,
  use_cache: Optional[bool] = None,
  output_attentions: Optional[bool] = None,
  output_hidden_states: Optional[bool] = None,
@@ -1285,12 +1540,31 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
  time_token_loss = None
  time_to_visit_loss = None
  token_value_loss = None
+ motor_tte_loss = None
+
  if labels is not None:
  # move labels to correct device to enable model parallelism
  labels = labels.to(lm_logits.device)
+
+ if self.config.causal_sfm:
+ # Ensure demographic_labels matches the dtype of original labels
+ demographic_labels = torch.full(
+ (labels.shape[0], self.config.demographics_size),
+ -100,
+ dtype=labels.dtype, # Match the original labels' dtype
+ device=labels.device, # Ensure on the same device
+ )
+ # Concatenate the demographic labels with the rest of the original labels
+ labels = torch.cat(
+ (demographic_labels, labels[:, self.config.demographics_size :]),
+ dim=1,
+ )
+
  # Shift so that tokens < n predict n
  shift_logits = lm_logits[..., :-1, :].contiguous()
  shift_labels = labels[..., 1:].contiguous()
+ valid_tokens: torch.BoolTensor = shift_labels != -100
+ total_num_tokens = valid_tokens.sum()
  if (
  self.cehrgpt.config.lab_token_penalty
  and self.cehrgpt.config.lab_token_exists
@@ -1310,28 +1584,60 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
  lab_index,
  token_loss * self.cehrgpt.config.lab_token_loss_weight,
  token_loss,
- ).mean()
+ )
+
+ token_loss = token_loss.sum() / total_num_tokens
  else:
  # Flatten the tokens
- loss_fct = CrossEntropyLoss()
+ loss_fct = CrossEntropyLoss(reduction="none")
  token_loss = loss_fct(
  shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
  )
- loss = token_loss
+ token_loss = token_loss.sum() / total_num_tokens
+
+ loss = token_loss * self.cehrgpt.config.next_token_prediction_loss_weight

  if self.cehrgpt.config.entropy_penalty:
  # Compute probabilities using softmax
- probs = torch.softmax(lm_logits, dim=-1)
+ probs = torch.softmax(shift_logits, dim=-1)
  # Compute negative entropy: sum(p * log(p))
  entropy = torch.sum(
  probs * torch.log(probs + 1e-9), dim=-1
  ) # Add epsilon for numerical stability
+ entropy = torch.where(valid_tokens, entropy, 0)
  # Regularization term: mean entropy scaled by alpha
- loss += self.cehrgpt.config.entropy_penalty_alpha * entropy.mean()
+ entropy_penalty = entropy.sum() / total_num_tokens
+ loss += entropy_penalty * self.cehrgpt.config.entropy_penalty_alpha
+
+ if (
+ self.config.include_motor_time_to_event
+ and motor_time_to_event_vectors is not None
+ and motor_event_indicators is not None
+ and motor_time_to_event_to_include is not None
+ and motor_time_indicators is not None
+ and motor_end_index is not None
+ ):
+ ve_token_id_indices = labels == self.config.ve_token_id
+ ve_token_features = hidden_states[ve_token_id_indices]
+ # Get rid of the last VE features because it's already reached the end of the patient sequence and
+ # there is nothing to predict.
+ motor_tte_loss = self.motor_nll_loss(
+ ve_token_features=ve_token_features,
+ motor_time_to_event_vectors=motor_time_to_event_vectors,
+ motor_event_indicators=motor_event_indicators,
+ motor_time_to_event_to_include=motor_time_to_event_to_include,
+ motor_time_indicators=motor_time_indicators,
+ batch_motor_end_index=motor_end_index,
+ )
+ loss += motor_tte_loss * self.config.motor_time_to_event_weight

  # We add another loss term when use_sub_time_tokenization is enabled, we need to recover the sub time token
  # predictions for year/month/token
- if self.config.use_sub_time_tokenization:
+ if (
+ self.config.use_sub_time_tokenization
+ and sub_time_tokens is not None
+ and time_token_indicators is not None
+ ):
  # Split the last dimensions into three parts
  time_loss_fct = CrossEntropyLoss(reduction="none")
  time_token_logits = self.time_token_lm_head(
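The pattern used throughout this loss block — per-token losses with reduction="none", zeroed where the label is the ignore index, then summed and divided by the number of valid tokens — is a masked mean. A small self-contained sketch with made-up shapes:

import torch
from torch.nn import CrossEntropyLoss

logits = torch.randn(2, 4, 7)                                  # (batch, seq, vocab)
labels = torch.tensor([[3, 5, -100, -100], [1, 2, 6, -100]])   # -100 marks ignored positions

loss_fct = CrossEntropyLoss(reduction="none")                  # ignored targets yield 0
per_token = loss_fct(logits.view(-1, 7), labels.view(-1))

valid = labels.view(-1) != -100
masked_mean = per_token.sum() / valid.sum()                    # averages over real tokens only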
@@ -1352,54 +1658,61 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
  ),
  shifted_time_token_labels.view(-1),
  )
-
- time_token_loss = time_token_loss.view(
- -1, 3
- ) * shifted_time_token_indicators.view(-1, 1).to(hidden_states.dtype)
- time_token_loss = time_token_loss.sum(-1)
- time_token_loss = (
- torch.mean(time_token_loss) * self.config.time_token_loss_weight
+ time_token_loss = torch.where(
+ shifted_time_token_indicators.view(-1, 1).to(torch.bool),
+ time_token_loss.view(-1, 3),
+ 0,
  )
- loss += time_token_loss
-
- if time_to_visits is not None:
- # Get lambda and k parameters
- lambda_param, k_param = self.tte_head(hidden_states)
-
- # Perform slicing before tensors are split across GPUs
- shifted_lambda_param = lambda_param[..., :-1, :].contiguous()
- shifted_k_param = k_param[..., :-1, :].contiguous()
- shift_time_to_visits = time_to_visits[..., 1:].contiguous()
-
- # Move to the same device as lambda_param
- shift_time_to_visits = shift_time_to_visits.to(lambda_param.device)
-
- time_to_visit_indicator = (shift_time_to_visits >= 0).to(
- hidden_states.dtype
- )
- # Define the Gamma distribution
- dist = Gamma(shifted_k_param.squeeze(-1), shifted_lambda_param.squeeze(-1))
- # Compute log-probs and apply the time_to_visit_indicator
- log_probs = dist.log_prob(torch.clamp(shift_time_to_visits, min=0.0) + 1e-6)
- log_probs *= time_to_visit_indicator
- time_to_visit_loss = (
- -log_probs.mean() * self.config.time_to_visit_loss_weight
- )
- # Compute the loss
- loss += time_to_visit_loss
-
- if true_values is not None and true_value_indicators is not None:
- true_values = true_values.to(value_logits.device)
- shift_value_logits = value_logits[..., :-1, :].contiguous()
- shift_value_indicators = true_value_indicators[..., :-1].contiguous()
- shift_next_values = true_values[..., 1:].contiguous()
- value_loss_fct = CrossEntropyLoss(reduce=False)
- token_value_loss = value_loss_fct(
- shift_value_logits.view(-1, shift_value_logits.size(-1)),
- shift_next_values.view(-1),
- )
- token_value_loss *= shift_value_indicators.view(-1)
- loss += token_value_loss.mean()
+ time_token_loss = time_token_loss.sum() / total_num_tokens
+ loss += time_token_loss * self.config.time_token_loss_weight
+
+ if time_to_visits is not None:
+ # Get lambda and k parameters
+ lambda_param, k_param = self.tte_head(hidden_states)
+
+ # Perform slicing before tensors are split across GPUs
+ shifted_lambda_param = lambda_param[..., :-1, :].contiguous()
+ shifted_k_param = k_param[..., :-1, :].contiguous()
+ shift_time_to_visits = time_to_visits[..., 1:].contiguous()
+
+ # Move to the same device as lambda_param
+ shift_time_to_visits = shift_time_to_visits.to(lambda_param.device)
+ time_to_visit_indicator = shift_time_to_visits >= 0
+ # Define the Gamma distribution
+ dist = Gamma(
+ shifted_k_param.squeeze(-1), shifted_lambda_param.squeeze(-1)
+ )
+ # Compute log-probs and apply the time_to_visit_indicator
+ log_probs = dist.log_prob(
+ torch.clamp(shift_time_to_visits, min=1e-3) + 1e-6
+ )
+ log_probs = torch.where(time_to_visit_indicator, log_probs, 0)
+ time_to_visit_loss = -log_probs.sum() / total_num_tokens
+ # Compute the loss
+ loss += time_to_visit_loss * self.config.time_to_visit_loss_weight
+
+ if true_values is not None and true_value_indicators is not None:
+ true_values = true_values.to(value_logits.device)
+ shift_value_logits = value_logits[..., :-1, :].contiguous()
+ shift_value_indicators = true_value_indicators[..., :-1].contiguous()
+ shift_next_values = true_values[..., 1:].contiguous()
+ value_loss_fct = CrossEntropyLoss(reduction="none")
+ token_value_loss = value_loss_fct(
+ shift_value_logits.view(-1, shift_value_logits.size(-1)),
+ shift_next_values.view(-1),
+ )
+ token_value_loss = torch.where(
+ shift_value_indicators.view(-1), token_value_loss, 0
+ )
+ token_value_loss = token_value_loss.sum() / total_num_tokens
+ if (
+ self.cehrgpt.config.lab_token_penalty
+ and self.cehrgpt.config.lab_token_exists
+ ):
+ token_value_loss = (
+ token_value_loss * self.config.lab_token_loss_weight
+ )
+ loss += token_value_loss * self.config.value_prediction_loss_weight

  if not return_dict:
  output = (lm_logits,) + transformer_outputs[1:]
@@ -1417,6 +1730,7 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
  time_token_loss=time_token_loss,
  time_to_visit_loss=time_to_visit_loss,
  token_value_loss=token_value_loss,
+ motor_tte_loss=motor_tte_loss,
  )

  @staticmethod
@@ -1690,6 +2004,27 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
  )


+ class FocalLoss(nn.Module):
+ def __init__(self, alpha=0.25, gamma=2.0, reduction="mean"):
+ super().__init__()
+ self.alpha = alpha
+ self.gamma = gamma
+ self.reduction = reduction
+
+ def forward(self, logits, targets):
+ bce_loss = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
+ probs = torch.sigmoid(logits)
+ pt = torch.where(targets == 1, probs, 1 - probs)
+ focal_term = (1 - pt) ** self.gamma
+ loss = self.alpha * focal_term * bce_loss
+
+ if self.reduction == "mean":
+ return loss.mean()
+ elif self.reduction == "sum":
+ return loss.sum()
+ return loss
+
+
  class CehrGptForClassification(CEHRGPTPreTrainedModel):
  _keep_in_fp32_modules = ["age_batch_norm", "dense_layer", "classifier"]

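A minimal usage sketch of the newly added FocalLoss (same import-path assumption as earlier; the logits and labels are made up). With gamma > 0, confidently classified examples are down-weighted, which helps with the heavy class imbalance typical of clinical outcome labels:

import torch
from cehrgpt.models.hf_cehrgpt import FocalLoss

logits = torch.tensor([2.5, -1.0, 0.3])   # raw classifier outputs
targets = torch.tensor([1.0, 0.0, 1.0])   # binary labels as floats

criterion = FocalLoss(alpha=0.25, gamma=2.0, reduction="mean")
loss = criterion(logits, targets)         # easy, correct examples contribute little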
@@ -1712,7 +2047,6 @@ class CehrGptForClassification(CEHRGPTPreTrainedModel):
  self.model_parallel = False
  self.device_map = None
  self.gradient_checkpointing = False
-
  # Initialize weights and apply final processing
  self.post_init()

@@ -1768,6 +2102,7 @@ class CehrGptForClassification(CEHRGPTPreTrainedModel):
  return_dict: Optional[bool] = None,
  **kwargs,
  ) -> CehrGptSequenceClassifierOutput:
+
  cehrgpt_output = self.cehrgpt(
  input_ids=input_ids,
  value_indicators=value_indicators,
@@ -1782,17 +2117,39 @@ class CehrGptForClassification(CEHRGPTPreTrainedModel):
  return_dict=return_dict,
  )

- # Disable autocasting for precision-sensitive operations
- with torch.autocast(device_type="cuda", enabled=False):
- normalized_age = self._apply_age_norm(age_at_index)
+ if is_sample_pack(attention_mask):
+ features = extract_features_from_packed_sequence(
+ cehrgpt_output.last_hidden_state, attention_mask
+ )
+ assert features.shape[1] == classifier_label.shape[1], (
+ "the length of the features need to be the same as the length of classifier_label. "
+ f"features.shape[1]: {features.shape[1]}, "
+ f"classifier_label.shape[1]: {classifier_label.shape[1]}"
+ )
+ assert features.shape[1] == age_at_index.shape[1], (
+ "the length of the features need to be the same as the length of age_at_index. "
+ f"features.shape[1]: {features.shape[1]}, "
+ f"age_at_index.shape[1]: {age_at_index.shape[1]}"
+ )
+ num_samples = age_at_index.shape[1]
+ features = features.view((num_samples, -1))
+ classifier_label = classifier_label.view((num_samples, -1))
+ with torch.autocast(device_type="cuda", enabled=False):
+ normalized_age = self._apply_age_norm(
+ age_at_index.view((num_samples, 1))
+ )
+ else:
+ features = cehrgpt_output.last_hidden_state[..., -1, :]
+ # Disable autocasting for precision-sensitive operations
+ with torch.autocast(device_type="cuda", enabled=False):
+ normalized_age = self._apply_age_norm(age_at_index)

  # In case the model is in bfloat16
- if cehrgpt_output.last_hidden_state.dtype != normalized_age.dtype:
- normalized_age = normalized_age.to(cehrgpt_output.last_hidden_state.dtype)
+ if features.dtype != normalized_age.dtype:
+ normalized_age = normalized_age.to(features.dtype)

  # In fine-tuning, the sequences are left-padded, so we use the last element as the pooler
- output_pooler = cehrgpt_output.last_hidden_state[..., -1, :]
- next_input = self.dropout(output_pooler)
+ next_input = self.dropout(features)
  next_input = torch.cat([next_input, normalized_age], dim=1)
  next_input = self.dense_layer(next_input)
  next_input = nn.functional.relu(next_input)
@@ -1801,7 +2158,14 @@ class CehrGptForClassification(CEHRGPTPreTrainedModel):

  loss = None
  if classifier_label is not None:
- loss_fct = nn.BCEWithLogitsLoss()
+ if self.config.class_weights:
+ class_weights = torch.tensor(
+ [self.config.class_weights[1] / self.config.class_weights[0]],
+ dtype=torch.float32,
+ ).to(logits.device)
+ else:
+ class_weights = None
+ loss_fct = nn.BCEWithLogitsLoss(pos_weight=class_weights)
  loss = loss_fct(logits, classifier_label)

  return CehrGptSequenceClassifierOutput(
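To illustrate what pos_weight does in the classifier loss above (the ratio below is made up, and the exact semantics of config.class_weights are defined by the package, not by this note): BCEWithLogitsLoss multiplies the positive-class term by pos_weight, so a value above 1 compensates for a rare positive class:

import torch
import torch.nn as nn

logits = torch.tensor([0.2, -1.3, 2.1])
labels = torch.tensor([1.0, 0.0, 1.0])

pos_weight = torch.tensor([9.0])  # e.g. roughly 9 negatives per positive (hypothetical)
weighted = nn.BCEWithLogitsLoss(pos_weight=pos_weight)(logits, labels)
plain = nn.BCEWithLogitsLoss()(logits, labels)
# weighted > plain: positive examples now count 9x in the average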