cehrgpt 0.0.2__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. cehrgpt/data/hf_cehrgpt_dataset.py +24 -4
  2. cehrgpt/data/hf_cehrgpt_dataset_collator.py +260 -84
  3. cehrgpt/data/hf_cehrgpt_dataset_mapping.py +99 -88
  4. cehrgpt/data/sample_packing_sampler.py +151 -0
  5. cehrgpt/generation/generate_batch_hf_gpt_sequence.py +12 -9
  6. cehrgpt/models/config.py +10 -0
  7. cehrgpt/models/hf_cehrgpt.py +243 -73
  8. cehrgpt/models/tokenization_hf_cehrgpt.py +4 -0
  9. cehrgpt/runners/data_utils.py +243 -0
  10. cehrgpt/runners/gpt_runner_util.py +0 -10
  11. cehrgpt/runners/hf_cehrgpt_finetune_runner.py +152 -279
  12. cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +229 -105
  13. cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +42 -0
  14. cehrgpt/runners/hyperparameter_search_util.py +4 -1
  15. cehrgpt/runners/sample_packing_trainer.py +168 -0
  16. cehrgpt/simulations/generate_plots.py +95 -0
  17. cehrgpt/simulations/run_simulation.sh +24 -0
  18. cehrgpt/simulations/time_embedding_simulation.py +250 -0
  19. cehrgpt/simulations/time_token_simulation.py +177 -0
  20. cehrgpt/tools/linear_prob/__init__.py +0 -0
  21. cehrgpt/tools/linear_prob/compute_cehrgpt_features.py +467 -0
  22. cehrgpt/tools/linear_prob/train_with_cehrgpt_features.py +152 -0
  23. {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.0.dist-info}/METADATA +7 -5
  24. {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.0.dist-info}/RECORD +28 -26
  25. {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.0.dist-info}/WHEEL +1 -1
  26. cehrgpt/data/hf_cehrgpt_dpo_collator.py +0 -71
  27. cehrgpt/data/hf_cehrgpt_dpo_dataset_mapping.py +0 -61
  28. cehrgpt/generation/generate_paired_cehrgpt_sequence.py +0 -224
  29. cehrgpt/rl_finetune/cehrgpt_dpo_trainer.py +0 -586
  30. cehrgpt/rl_finetune/cehrgpt_ppo_trainer.py +0 -464
  31. cehrgpt/rl_finetune/ppo_finetune.py +0 -394
  32. cehrgpt/rl_finetune/ppo_finetune_v2.py +0 -373
  33. cehrgpt/runners/hf_cehrgpt_dpo_runner.py +0 -119
  34. /cehrgpt/{rl_finetune → simulations}/__init__.py +0 -0
  35. {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.0.dist-info/licenses}/LICENSE +0 -0
  36. {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.0.dist-info}/top_level.txt +0 -0
@@ -6,7 +6,7 @@ import numpy as np
 import torch
 import torch.nn.functional as f
 from torch import nn
-from torch.distributions import Gamma
+from torch.distributions import Gamma, Weibull
 from torch.nn import CrossEntropyLoss
 from torch.nn import functional as F
 from transformers import PreTrainedModel
@@ -45,12 +45,108 @@ if is_accelerate_available():
 logger = logging.get_logger(__name__)
 
 
+def extract_features_from_packed_sequence(
+    hidden_state: torch.Tensor,
+    attention_mask: torch.Tensor,
+) -> torch.Tensor:
+    max_index = attention_mask.nonzero(as_tuple=False).flatten()[-1]
+    padded_attention_mask = F.pad(attention_mask[:, : max_index + 1], (0, 1))
+    feature_indices = torch.nonzero(padded_attention_mask == 0)[:, 1] - 1
+    return hidden_state[:, feature_indices]
+
+
+def create_sample_packing_attention_mask(attention_mask: torch.Tensor) -> torch.Tensor:
+    """
+    Create a block-diagonal attention mask for packed sequences within a batch.
+
+    Args:
+        attention_mask (torch.Tensor): (batch_size, seq_len) binary mask where 1 = token, 0 = padding
+
+    Returns:
+        torch.Tensor: (batch_size, seq_len, seq_len) attention mask where entries are 1 if tokens
+            can attend to each other (within same packed segment), 0 otherwise.
+    """
+    # Step 1: Identify segments within each sample
+    cumsum_mask = (attention_mask == 0).cumsum(dim=-1)
+    segment_ids = cumsum_mask * attention_mask # zeros remain zero
+
+    # Step 2: Compare segment IDs pairwise per batch element
+    # Shape: (batch_size, seq_len, seq_len)
+    attn_matrix = (segment_ids.unsqueeze(2) == segment_ids.unsqueeze(1)).int()
+
+    # Step 3: Mask out padding tokens
+    mask = attention_mask.unsqueeze(1) * attention_mask.unsqueeze(2)
+    attn_matrix = attn_matrix * mask
+
+    return attn_matrix
+
+
+def is_sample_pack(attention_mask: torch.Tensor) -> bool:
+    """
+    Determines whether any sequence in the batch is likely sample-packed.
+
+    A sample-packed sequence is one where there are non-padding (1) tokens
+    after a padding (0) token, indicating multiple sequences packed together
+    with padding as a separator.
+
+    Args:
+        attention_mask (torch.Tensor): A tensor of shape (batch_size, seq_len)
+            where 1 indicates a real token and 0 indicates padding.
+
+    Returns:
+        bool: True if any sample in the batch is sample-packed, False otherwise.
+    """
+
+    # If the attention_mask is left padded, we will flip it so we can use the same logic below
+    if (attention_mask[:, 0] == 0).any():
+        attention_mask = attention_mask.flip(dims=[1])
+
+    nonzero_counts = attention_mask.sum(dim=1)
+    max_token_positions = torch.argmax(attention_mask.flip(dims=[1]), dim=1)
+    max_indices = attention_mask.shape[1] - 1 - max_token_positions
+    return torch.any(nonzero_counts < (max_indices + 1)).item()
+
+
 # Copied from transformers.models.llama.modeling_llama._get_unpad_data
 def _get_unpad_data(attention_mask):
-    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
-    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
-    max_seqlen_in_batch = seqlens_in_batch.max().item()
-    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    # This infers sample packing
+    if is_sample_pack(attention_mask):
+        # Assume input: attention_mask shape = (batch, seq_len)
+        attention_mask = attention_mask.flatten() # shape: (seq_len,)
+
+        # Compute max_index of the last non-zero element
+        nonzero = torch.nonzero(attention_mask, as_tuple=False).flatten()
+        max_index = nonzero[-1].item()
+
+        # Pad the truncated attention mask
+        padded_attention_mask = F.pad(attention_mask[: max_index + 1], (0, 1), value=0)
+
+        # Indices of all tokens
+        indices = torch.nonzero(attention_mask, as_tuple=False).flatten()
+
+        # Find where 0s occur (segment boundaries)
+        cumsum_seqlens_in_batch = torch.cumsum(padded_attention_mask, dim=0)[
+            padded_attention_mask == 0
+        ]
+
+        # Compute seqlens per segment
+        seqlens_in_batch = (
+            cumsum_seqlens_in_batch
+            - F.pad(cumsum_seqlens_in_batch, (1, 0), value=0)[:-1]
+        ).to(torch.int)
+
+        max_seqlen_in_batch = (
+            seqlens_in_batch.max().item() if seqlens_in_batch.numel() > 0 else 0
+        )
+        cu_seqlens = F.pad(cumsum_seqlens_in_batch, (1, 0)).to(torch.int)
+    else:
+        seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+        indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+        max_seqlen_in_batch = seqlens_in_batch.max().item()
+        cu_seqlens = F.pad(
+            torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)
+        )
+
     return (
         indices,
         cu_seqlens,
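To illustrate the sample-packing helpers introduced above, here is a minimal standalone sketch (illustration only, not part of the diff) that exercises is_sample_pack and create_sample_packing_attention_mask on a toy mask; the import path assumes the helpers are exposed at module level in cehrgpt/models/hf_cehrgpt.py as shown in the hunk:

import torch
from cehrgpt.models.hf_cehrgpt import (
    create_sample_packing_attention_mask,
    is_sample_pack,
)

# Two packed segments of lengths 3 and 2, separated and terminated by padding zeros.
attention_mask = torch.tensor([[1, 1, 1, 0, 1, 1, 0, 0]])

assert is_sample_pack(attention_mask)  # real tokens appear after a padding token

# Block-diagonal (1, 8, 8) mask: positions 0-2 attend only to each other,
# positions 4-5 likewise, and padding positions attend to nothing.
block_mask = create_sample_packing_attention_mask(attention_mask)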
@@ -609,7 +705,8 @@ class CEHRGPT2Model(CEHRGPTPreTrainedModel):
             self.pretrained_wte = None
 
         self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
-        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
+        if not self.exclude_position_ids:
+            self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
         if self.include_values:
             self.vte = nn.Embedding(config.value_vocab_size, self.embed_dim)
             self.concept_value_transformation_layer = ConceptValueTransformationLayer(
@@ -635,6 +732,14 @@ class CEHRGPT2Model(CEHRGPTPreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
 
+        # We do need to update the pre-computed attention bias matrix if sample packing requires a larger context window
+        if self.config.sample_packing_max_positions > self.config.n_positions:
+            logger.info(
+                "Updated attn_bias to %s according to sample_packing_max_positions",
+                config.sample_packing_max_positions,
+            )
+            self.update_attn_bias(self.config.sample_packing_max_positions)
+
     def initialize_pretrained_embeddings(self):
         layers = [
             nn.Embedding(self.config.vocab_size, self.config.pretrained_embedding_dim),
@@ -677,7 +782,8 @@ class CEHRGPT2Model(CEHRGPTPreTrainedModel):
         self.wte = self.wte.to(self.first_device)
         if self.config.use_pretrained_embeddings:
             self.pretrained_wte = self.pretrained_wte.to(self.first_device)
-        self.wpe = self.wpe.to(self.first_device)
+        if not self.exclude_position_ids:
+            self.wpe = self.wpe.to(self.first_device)
         if self.include_values:
             self.vte = self.vte.to(self.first_device)
             self.concept_value_transformation_layer = (
@@ -703,7 +809,8 @@ class CEHRGPT2Model(CEHRGPTPreTrainedModel):
         self.wte = self.wte.to("cpu")
         if self.config.use_pretrained_embeddings:
             self.pretrained_wte = self.pretrained_wte.to("cpu")
-        self.wpe = self.wpe.to("cpu")
+        if not self.exclude_position_ids:
+            self.wpe = self.wpe.to("cpu")
         self.vte = self.vte.to("cpu")
         self.concept_value_transformation_layer = (
             self.concept_value_transformation_layer.to("cpu")
@@ -728,8 +835,12 @@ class CEHRGPT2Model(CEHRGPTPreTrainedModel):
             persistent=False,
         )
 
-    def get_position_embeddings(self) -> Union[nn.Embedding, Tuple[nn.Embedding]]:
-        return self.wpe
+    def get_position_embeddings(
+        self,
+    ) -> Optional[Union[nn.Embedding, Tuple[nn.Embedding]]]:
+        if not self.exclude_position_ids:
+            return self.wpe
+        return None
 
     def set_position_embeddings(self, new_embeddings: nn.Embedding):
         self.wpe = new_embeddings
@@ -758,8 +869,8 @@ class CEHRGPT2Model(CEHRGPTPreTrainedModel):
     def forward(
         self,
         input_ids: Optional[torch.LongTensor],
-        value_indicators: Optional[torch.BoolTensor],
-        values: Optional[torch.LongTensor],
+        value_indicators: Optional[torch.BoolTensor] = None,
+        values: Optional[torch.LongTensor] = None,
         past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
         attention_mask: Optional[torch.FloatTensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
@@ -850,12 +961,19 @@ class CEHRGPT2Model(CEHRGPTPreTrainedModel):
                 == "flash_attention_2"
             ):
                 attention_mask = attention_mask.view(batch_size, -1)
-                # We create a 3D attention mask from a 2D tensor mask.
-                # Sizes are [batch_size, 1, 1, to_seq_length]
-                # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-                # this attention mask is more simple than the triangular masking of causal attention
-                # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-                attention_mask = attention_mask[:, None, None, :]
+
+                # If this is sample packing, we need to create the block-diagonal attention mask
+                if is_sample_pack(attention_mask):
+                    attention_mask = create_sample_packing_attention_mask(
+                        attention_mask
+                    )[:, None, :, :]
+                else:
+                    # We create a 3D attention mask from a 2D tensor mask.
+                    # Sizes are [batch_size, 1, 1, to_seq_length]
+                    # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+                    # this attention mask is more simple than the triangular masking of causal attention
+                    # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+                    attention_mask = attention_mask[:, None, None, :]
 
             # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
             # masked positions, this operation will create a tensor which is 0.0 for
@@ -1288,9 +1406,26 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
         if labels is not None:
             # move labels to correct device to enable model parallelism
             labels = labels.to(lm_logits.device)
+
+            if self.config.causal_sfm:
+                # Ensure demographic_labels matches the dtype of original labels
+                demographic_labels = torch.full(
+                    (labels.shape[0], self.config.demographics_size),
+                    -100,
+                    dtype=labels.dtype, # Match the original labels' dtype
+                    device=labels.device, # Ensure on the same device
+                )
+                # Concatenate the demographic labels with the rest of the original labels
+                labels = torch.cat(
+                    (demographic_labels, labels[:, self.config.demographics_size :]),
+                    dim=1,
+                )
+
             # Shift so that tokens < n predict n
             shift_logits = lm_logits[..., :-1, :].contiguous()
             shift_labels = labels[..., 1:].contiguous()
+            valid_tokens: torch.BoolTensor = shift_labels != 100
+            total_num_tokens = valid_tokens.sum()
             if (
                 self.cehrgpt.config.lab_token_penalty
                 and self.cehrgpt.config.lab_token_exists
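In the causal_sfm branch above, the demographic prefix is excluded from the next-token objective by overwriting those label positions with -100, the index that CrossEntropyLoss ignores by default. A minimal sketch of the idea with toy shapes (independent of the model code):

import torch
from torch.nn import CrossEntropyLoss

demographics_size = 4
labels = torch.randint(0, 100, (2, 16))  # toy label ids, vocab size 100

# Replace the demographic prefix with -100 so it contributes nothing to the loss.
demographic_labels = torch.full((labels.shape[0], demographics_size), -100)
labels = torch.cat((demographic_labels, labels[:, demographics_size:]), dim=1)

logits = torch.randn(2, 16, 100)
loss_fct = CrossEntropyLoss()  # ignore_index defaults to -100
loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))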
@@ -1310,24 +1445,30 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
                     lab_index,
                     token_loss * self.cehrgpt.config.lab_token_loss_weight,
                     token_loss,
-                ).mean()
+                )
+
+                token_loss = token_loss.sum() / total_num_tokens
             else:
                 # Flatten the tokens
-                loss_fct = CrossEntropyLoss()
+                loss_fct = CrossEntropyLoss(reduction="none")
                 token_loss = loss_fct(
                     shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
                 )
-                loss = token_loss
+                token_loss = token_loss.sum() / total_num_tokens
+
+            loss = token_loss * self.cehrgpt.config.next_token_prediction_loss_weight
 
             if self.cehrgpt.config.entropy_penalty:
                 # Compute probabilities using softmax
-                probs = torch.softmax(lm_logits, dim=-1)
+                probs = torch.softmax(shift_logits, dim=-1)
                 # Compute negative entropy: sum(p * log(p))
                 entropy = torch.sum(
                     probs * torch.log(probs + 1e-9), dim=-1
                 ) # Add epsilon for numerical stability
+                entropy = torch.where(valid_tokens, entropy, 0)
                 # Regularization term: mean entropy scaled by alpha
-                loss += self.cehrgpt.config.entropy_penalty_alpha * entropy.mean()
+                entropy_penalty = entropy.sum() / total_num_tokens
+                loss += entropy_penalty * self.cehrgpt.config.entropy_penalty_alpha
 
             # We add another loss term when use_sub_time_tokenization is enabled, we need to recover the sub time token
             # predictions for year/month/token
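The edits above replace plain .mean() reductions with a sum over valid token positions divided by total_num_tokens, so padding inside packed batches no longer dilutes the loss, and each term is then scaled by its configured weight. A minimal standalone sketch of that normalization with toy tensors (the -100 ignore index follows the usual PyTorch convention):

import torch
from torch.nn import CrossEntropyLoss

logits = torch.randn(2, 8, 50)   # (batch, seq_len, vocab)
labels = torch.randint(0, 50, (2, 8))
labels[:, -3:] = -100            # padded / ignored positions

shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
valid_tokens = shift_labels != -100
total_num_tokens = valid_tokens.sum()

loss_fct = CrossEntropyLoss(reduction="none")  # ignored positions contribute 0
token_loss = loss_fct(
    shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
)
# Average over real tokens only, rather than over every position in the batch.
loss = token_loss.sum() / total_num_tokens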
@@ -1352,54 +1493,60 @@ class CEHRGPT2LMHeadModel(CEHRGPTPreTrainedModel):
                     ),
                     shifted_time_token_labels.view(-1),
                 )
-
-                time_token_loss = time_token_loss.view(
-                    -1, 3
-                ) * shifted_time_token_indicators.view(-1, 1).to(hidden_states.dtype)
-                time_token_loss = time_token_loss.sum(-1)
-                time_token_loss = (
-                    torch.mean(time_token_loss) * self.config.time_token_loss_weight
+                time_token_loss = torch.where(
+                    shifted_time_token_indicators.view(-1, 1).to(torch.bool),
+                    time_token_loss.view(-1, 3),
+                    0,
                 )
-                loss += time_token_loss
+                time_token_loss = time_token_loss.sum() / total_num_tokens
+                loss += time_token_loss * self.config.time_token_loss_weight
 
-            if time_to_visits is not None:
-                # Get lambda and k parameters
-                lambda_param, k_param = self.tte_head(hidden_states)
+            if time_to_visits is not None:
+                # Get lambda and k parameters
+                lambda_param, k_param = self.tte_head(hidden_states)
 
-                # Perform slicing before tensors are split across GPUs
-                shifted_lambda_param = lambda_param[..., :-1, :].contiguous()
-                shifted_k_param = k_param[..., :-1, :].contiguous()
-                shift_time_to_visits = time_to_visits[..., 1:].contiguous()
+                # Perform slicing before tensors are split across GPUs
+                shifted_lambda_param = lambda_param[..., :-1, :].contiguous()
+                shifted_k_param = k_param[..., :-1, :].contiguous()
+                shift_time_to_visits = time_to_visits[..., 1:].contiguous()
 
-                # Move to the same device as lambda_param
-                shift_time_to_visits = shift_time_to_visits.to(lambda_param.device)
+                # Move to the same device as lambda_param
+                shift_time_to_visits = shift_time_to_visits.to(lambda_param.device)
 
-                time_to_visit_indicator = (shift_time_to_visits >= 0).to(
-                    hidden_states.dtype
-                )
-                # Define the Gamma distribution
-                dist = Gamma(shifted_k_param.squeeze(-1), shifted_lambda_param.squeeze(-1))
-                # Compute log-probs and apply the time_to_visit_indicator
-                log_probs = dist.log_prob(torch.clamp(shift_time_to_visits, min=0.0) + 1e-6)
-                log_probs *= time_to_visit_indicator
-                time_to_visit_loss = (
-                    -log_probs.mean() * self.config.time_to_visit_loss_weight
-                )
-                # Compute the loss
-                loss += time_to_visit_loss
-
-            if true_values is not None and true_value_indicators is not None:
-                true_values = true_values.to(value_logits.device)
-                shift_value_logits = value_logits[..., :-1, :].contiguous()
-                shift_value_indicators = true_value_indicators[..., :-1].contiguous()
-                shift_next_values = true_values[..., 1:].contiguous()
-                value_loss_fct = CrossEntropyLoss(reduce=False)
-                token_value_loss = value_loss_fct(
-                    shift_value_logits.view(-1, shift_value_logits.size(-1)),
-                    shift_next_values.view(-1),
-                )
-                token_value_loss *= shift_value_indicators.view(-1)
-                loss += token_value_loss.mean()
+                time_to_visit_indicator = shift_time_to_visits >= 0
+                # Define the Gamma distribution
+                dist = Gamma(
+                    shifted_k_param.squeeze(-1), shifted_lambda_param.squeeze(-1)
+                )
+                # Compute log-probs and apply the time_to_visit_indicator
+                log_probs = dist.log_prob(torch.clamp(shift_time_to_visits, min=1e-3))
+                log_probs = torch.where(time_to_visit_indicator, log_probs, 0)
+                time_to_visit_loss = -log_probs.sum() / total_num_tokens
+                # Compute the loss
+                loss += time_to_visit_loss * self.config.time_to_visit_loss_weight
+
+            if true_values is not None and true_value_indicators is not None:
+                true_values = true_values.to(value_logits.device)
+                shift_value_logits = value_logits[..., :-1, :].contiguous()
+                shift_value_indicators = true_value_indicators[..., :-1].contiguous()
+                shift_next_values = true_values[..., 1:].contiguous()
+                value_loss_fct = CrossEntropyLoss(reduction="none")
+                token_value_loss = value_loss_fct(
+                    shift_value_logits.view(-1, shift_value_logits.size(-1)),
+                    shift_next_values.view(-1),
+                )
+                token_value_loss = torch.where(
+                    shift_value_indicators.view(-1), token_value_loss, 0
+                )
+                token_value_loss = token_value_loss.sum() / total_num_tokens
+                if (
+                    self.cehrgpt.config.lab_token_penalty
+                    and self.cehrgpt.config.lab_token_exists
+                ):
+                    token_value_loss = (
+                        token_value_loss * self.config.lab_token_loss_weight
+                    )
+                loss += token_value_loss * self.config.value_prediction_loss_weight
 
             if not return_dict:
                 output = (lm_logits,) + transformer_outputs[1:]
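The time-to-visit term fits a Gamma head and adds the negative log-likelihood of the observed inter-visit intervals, again normalized by a token count rather than a plain mean. A minimal standalone sketch of that likelihood with toy tensors (torch.distributions.Gamma takes concentration then rate, matching the k/lambda order used above; here the sum is normalized by the number of positions carrying a real target):

import torch
from torch.distributions import Gamma

k_param = torch.rand(2, 7) + 0.5       # concentration per position
lambda_param = torch.rand(2, 7) + 0.5  # rate per position
time_to_visits = torch.tensor([[3.0, -100.0, 10.0, 0.5, -100.0, 2.0, 1.0],
                               [7.0, 1.0, -100.0, 4.0, 2.0, -100.0, 0.1]])

indicator = time_to_visits >= 0  # positions with an observed interval
dist = Gamma(k_param, lambda_param)
log_probs = dist.log_prob(torch.clamp(time_to_visits, min=1e-3))
log_probs = torch.where(indicator, log_probs, torch.zeros_like(log_probs))
time_to_visit_loss = -log_probs.sum() / indicator.sum()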
@@ -1768,6 +1915,7 @@ class CehrGptForClassification(CEHRGPTPreTrainedModel):
         return_dict: Optional[bool] = None,
         **kwargs,
     ) -> CehrGptSequenceClassifierOutput:
+
         cehrgpt_output = self.cehrgpt(
             input_ids=input_ids,
             value_indicators=value_indicators,
@@ -1782,17 +1930,39 @@ class CehrGptForClassification(CEHRGPTPreTrainedModel):
             return_dict=return_dict,
         )
 
-        # Disable autocasting for precision-sensitive operations
-        with torch.autocast(device_type="cuda", enabled=False):
-            normalized_age = self._apply_age_norm(age_at_index)
+        if is_sample_pack(attention_mask):
+            features = extract_features_from_packed_sequence(
+                cehrgpt_output.last_hidden_state, attention_mask
+            )
+            assert features.shape[1] == classifier_label.shape[1], (
+                "the length of the features need to be the same as the length of classifier_label. "
+                f"features.shape[1]: {features.shape[1]}, "
+                f"classifier_label.shape[1]: {classifier_label.shape[1]}"
+            )
+            assert features.shape[1] == age_at_index.shape[1], (
+                "the length of the features need to be the same as the length of age_at_index. "
+                f"features.shape[1]: {features.shape[1]}, "
+                f"age_at_index.shape[1]: {age_at_index.shape[1]}"
+            )
+            num_samples = age_at_index.shape[1]
+            features = features.view((num_samples, -1))
+            classifier_label = classifier_label.view((num_samples, -1))
+            with torch.autocast(device_type="cuda", enabled=False):
+                normalized_age = self._apply_age_norm(
+                    age_at_index.view((num_samples, 1))
+                )
+        else:
+            features = cehrgpt_output.last_hidden_state[..., -1, :]
+            # Disable autocasting for precision-sensitive operations
+            with torch.autocast(device_type="cuda", enabled=False):
+                normalized_age = self._apply_age_norm(age_at_index)
 
         # In case the model is in bfloat16
-        if cehrgpt_output.last_hidden_state.dtype != normalized_age.dtype:
-            normalized_age = normalized_age.to(cehrgpt_output.last_hidden_state.dtype)
+        if features.dtype != normalized_age.dtype:
+            normalized_age = normalized_age.to(features.dtype)
 
         # In fine-tuning, the sequences are left-padded, so we use the last element as the pooler
-        output_pooler = cehrgpt_output.last_hidden_state[..., -1, :]
-        next_input = self.dropout(output_pooler)
+        next_input = self.dropout(features)
         next_input = torch.cat([next_input, normalized_age], dim=1)
         next_input = self.dense_layer(next_input)
         next_input = nn.functional.relu(next_input)
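With sample packing, the classifier head pulls one feature vector per packed sample (the hidden state of the last real token before each padding boundary) instead of the single last position of the row. A minimal sketch of that selection, mirroring the logic of extract_features_from_packed_sequence with toy tensors:

import torch
import torch.nn.functional as F

hidden_state = torch.randn(1, 8, 4)                         # (1, seq_len, hidden)
attention_mask = torch.tensor([[1, 1, 1, 0, 1, 1, 0, 0]])   # two packed samples

# Trim trailing padding, then append a zero so the last sample also ends on a boundary.
max_index = attention_mask.nonzero(as_tuple=False).flatten()[-1]
padded_mask = F.pad(attention_mask[:, : max_index + 1], (0, 1))

# The position right before each zero is the last real token of a packed sample.
feature_indices = torch.nonzero(padded_mask == 0)[:, 1] - 1
features = hidden_state[:, feature_indices]                 # shape: (1, 2, 4)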
@@ -25,6 +25,7 @@ from tokenizers.pre_tokenizers import WhitespaceSplit
 from tokenizers.trainers import WordLevelTrainer
 from tqdm import tqdm
 from transformers import PreTrainedTokenizer
+from transformers.utils import logging
 
 from cehrgpt.gpt_utils import (
     convert_time_interval_to_time_tuple,
@@ -53,6 +54,7 @@ TOKEN_TO_SUB_TIME_TOKEN_MAPPING_FILE_NAME = "token_to_sub_time_token_mapping.jso
 LAB_STATS_FILE_NAME = "cehrgpt_lab_stats.pickle"
 LEGACY_LAB_STATS_FILE_NAME = "cehrgpt_lab_stats.json"
 CONCEPT_MAPPING_FILE_NAME = "concept_name_mapping.json"
+LOG = logging.get_logger("transformers")
 
 
 def truncated_sample(sample, standard_deviation):
@@ -888,6 +890,7 @@ class CehrGptTokenizer(PreTrainedTokenizer):
         if isinstance(dataset, DatasetDict):
             dataset = dataset["train"]
 
+        LOG.info("Training the tokenizer for concepts")
         concept_tokenizer = cls.train_concept_tokenizer(
             dataset,
             feature_name="concept_ids",
@@ -900,6 +903,7 @@ class CehrGptTokenizer(PreTrainedTokenizer):
             if concept_value_column not in row:
                 concept_value_column = "concept_values"
             break
+        LOG.info("Training the tokenizer for values")
         value_tokenizer = cls.train_concept_tokenizer(
             dataset,
             feature_name=concept_value_column,