cehrgpt 0.0.2__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. cehrgpt/analysis/irregularity.py +36 -0
  2. cehrgpt/data/hf_cehrgpt_dataset.py +25 -4
  3. cehrgpt/data/hf_cehrgpt_dataset_collator.py +635 -97
  4. cehrgpt/data/hf_cehrgpt_dataset_mapping.py +308 -95
  5. cehrgpt/data/sample_packing_sampler.py +181 -0
  6. cehrgpt/generation/generate_batch_hf_gpt_sequence.py +12 -9
  7. cehrgpt/generation/omop_converter_batch.py +32 -2
  8. cehrgpt/gpt_utils.py +20 -2
  9. cehrgpt/models/config.py +35 -0
  10. cehrgpt/models/hf_cehrgpt.py +470 -106
  11. cehrgpt/models/hf_modeling_outputs.py +1 -0
  12. cehrgpt/models/special_tokens.py +1 -0
  13. cehrgpt/models/tokenization_hf_cehrgpt.py +358 -71
  14. cehrgpt/runners/data_utils.py +358 -0
  15. cehrgpt/runners/gpt_runner_util.py +0 -10
  16. cehrgpt/runners/hf_cehrgpt_finetune_runner.py +181 -283
  17. cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +288 -112
  18. cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +90 -0
  19. cehrgpt/runners/hyperparameter_search_util.py +10 -8
  20. cehrgpt/runners/sample_packing_trainer.py +185 -0
  21. cehrgpt/simulations/generate_plots.py +95 -0
  22. cehrgpt/simulations/run_simulation.sh +24 -0
  23. cehrgpt/simulations/time_embedding_simulation.py +250 -0
  24. cehrgpt/simulations/time_token_simulation.py +177 -0
  25. cehrgpt/time_to_event/config/1_year_cabg.yaml +23 -0
  26. cehrgpt/time_to_event/time_to_event_model.py +2 -13
  27. cehrgpt/time_to_event/time_to_event_prediction.py +27 -13
  28. cehrgpt/tools/linear_prob/__init__.py +0 -0
  29. cehrgpt/tools/linear_prob/compute_cehrgpt_features.py +495 -0
  30. cehrgpt/tools/linear_prob/train_with_cehrgpt_features.py +152 -0
  31. {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/METADATA +11 -8
  32. {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/RECORD +36 -32
  33. {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/WHEEL +1 -1
  34. cehrgpt/data/hf_cehrgpt_dpo_collator.py +0 -71
  35. cehrgpt/data/hf_cehrgpt_dpo_dataset_mapping.py +0 -61
  36. cehrgpt/generation/generate_paired_cehrgpt_sequence.py +0 -224
  37. cehrgpt/rl_finetune/cehrgpt_dpo_trainer.py +0 -586
  38. cehrgpt/rl_finetune/cehrgpt_ppo_trainer.py +0 -464
  39. cehrgpt/rl_finetune/ppo_finetune.py +0 -394
  40. cehrgpt/rl_finetune/ppo_finetune_v2.py +0 -373
  41. cehrgpt/runners/hf_cehrgpt_dpo_runner.py +0 -119
  42. /cehrgpt/{rl_finetune → simulations}/__init__.py +0 -0
  43. {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info/licenses}/LICENSE +0 -0
  44. {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/top_level.txt +0 -0
cehrgpt/data/sample_packing_sampler.py ADDED
@@ -0,0 +1,181 @@
+from typing import Iterator, List, Optional
+
+import numpy as np
+import torch
+import torch.distributed as dist
+from torch.utils.data import Sampler
+from transformers import logging
+
+LOG = logging.get_logger("transformers")
+
+
+class SamplePlacerHolder:
+    def __init__(self):
+        self.epoch = 0
+
+    def set_epoch(self, epoch):
+        self.epoch = epoch
+
+
+class SamplePackingBatchSampler(Sampler[List[int]]):
+    """
+    A batch sampler that creates batches by packing samples together
+    to maximize GPU utilization, ensuring the total tokens per batch
+    doesn't exceed max_tokens_per_batch.
+    """
+
+    def __init__(
+        self,
+        lengths: List[int],
+        max_tokens_per_batch: int,
+        max_position_embeddings: int,
+        num_replicas: Optional[int] = None,
+        rank: Optional[int] = None,
+        seed: int = 0,
+        drop_last: bool = False,
+        negative_sampling_probability: Optional[float] = None,
+        labels: Optional[List[int]] = None,
+    ):
+        """
+        Args:
+            lengths: List of sequence lengths for each sample
+            max_tokens_per_batch: Maximum number of tokens in a batch
+            drop_last: Whether to drop the last incomplete batch
+        """
+        super().__init__()
+
+        if num_replicas is None:
+            if dist.is_available() and dist.is_initialized():
+                num_replicas = dist.get_world_size()
+                LOG.info(
+                    "torch.distributed is initialized and there are %s replicas",
+                    num_replicas,
+                )
+            else:
+                num_replicas = 1
+                LOG.info(
+                    "torch.distributed is not initialized, defaulting num_replicas to 1"
+                )
+
+        if rank is None:
+            if dist.is_available() and dist.is_initialized():
+                rank = dist.get_rank()
+                LOG.info(
+                    "torch.distributed is initialized and the current rank is %s", rank
+                )
+            else:
+                rank = 0
+                LOG.info(
+                    "torch.distributed is not initialized, defaulting rank to 0"
+                )
+
+        if not (0 <= rank < num_replicas):
+            raise ValueError(
+                f"Invalid rank {rank}, rank should be in the interval [0, {num_replicas - 1}]"
+            )
+
+        if negative_sampling_probability is not None and labels is None:
+            raise ValueError(
+                "When the negative sampling probability is provided, the labels must be provided as well"
+            )
+
+        self.lengths = lengths
+        self.max_tokens_per_batch = max_tokens_per_batch
+        self.max_position_embeddings = max_position_embeddings
+        self.num_replicas = num_replicas
+        self.rank = rank
+        self.seed = seed
+        self.drop_last = drop_last
+        self.negative_sampling_probability = negative_sampling_probability
+        self.labels = labels
+        # Trainer https://github.com/huggingface/transformers/blame/main/src/transformers/trainer.py#L2470
+        # http://github.com/huggingface/accelerate/blob/v0.31.0/src/accelerate/data_loader.py#L482
+        # The huggingface trainer will call accelerate.data_loader.DataLoaderShard.set_epoch,
+        # which will call batch_sampler.sampler.set_epoch
+        self.sampler = SamplePlacerHolder()
+
+    def __iter__(self) -> Iterator[List[int]]:
+        # deterministically shuffle based on epoch and seed
+        g = torch.Generator()
+        g.manual_seed(self.seed + self.sampler.epoch)
+        indices = torch.randperm(len(self.lengths), generator=g).tolist()
+
+        # Partition indices for this rank
+        indices = indices[self.rank :: self.num_replicas]
+
+        batch = []
+        current_batch_tokens = 0
+
+        for idx in indices:
+            # There is a chance to skip the negative samples to account for the class imbalance
+            # in the fine-tuning dataset
+            if self.negative_sampling_probability:
+                if (
+                    np.random.random() > self.negative_sampling_probability
+                    and self.labels[idx] == 0
+                ):
+                    continue
+            # We take the minimum of the two because each sequence will be truncated to fit
+            # the context window of the model
+            sample_length = min(self.lengths[idx], self.max_position_embeddings)
+            # If adding this sample would exceed max_tokens_per_batch, yield the current batch
+            if (
+                current_batch_tokens + sample_length + 2 > self.max_tokens_per_batch
+                and batch
+            ):
+                yield batch
+                batch = []
+                current_batch_tokens = 0
+
+            # Add the sample to the current batch
+            batch.append(idx)
+            # plus an extra two for the [END] and [PAD] tokens that separate samples
+            current_batch_tokens += sample_length + 2
+
+        # Yield the last batch if it's not empty and we're not dropping it
+        if batch and not self.drop_last:
+            yield batch
+
+    def __len__(self) -> int:
+        """
+        Estimates the number of batches that will be generated.
+
+        This is an approximation since the exact number depends on the specific
+        sequence lengths and their order.
+        """
+        if len(self.lengths) == 0:
+            return 0
+
+        # There is a chance to skip the negative samples to account for the class imbalance
+        # in the fine-tuning dataset
+        if self.negative_sampling_probability:
+            truncated_lengths = []
+            for length, label in zip(self.lengths, self.labels):
+                if (
+                    np.random.random() > self.negative_sampling_probability
+                    and label == 0
+                ):
+                    continue
+                truncated_lengths.append(length)
+        else:
+            # We need to truncate the lengths due to the context window limit imposed by the model
+            truncated_lengths = [
+                min(self.max_position_embeddings, length + 2) for length in self.lengths
+            ]
+
+        # Calculate average sequence length
+        avg_seq_length = sum(truncated_lengths) // len(truncated_lengths)
+
+        # Estimate average number of sequences per batch
+        seqs_per_batch = self.max_tokens_per_batch // avg_seq_length
+
+        # Estimate total number of batches
+        if self.drop_last:
+            # If dropping the last incomplete batch
+            return len(truncated_lengths) // seqs_per_batch
+        else:
+            # If keeping the last incomplete batch, ensure at least 1 batch
+            return max(1, len(truncated_lengths) // seqs_per_batch)
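
For context, here is a minimal usage sketch (not part of this diff) showing how the new batch sampler plugs into a standard PyTorch `DataLoader` via the `batch_sampler` argument. The toy dataset, token budget, and collate function are illustrative assumptions:

```python
from torch.utils.data import DataLoader, Dataset

from cehrgpt.data.sample_packing_sampler import SamplePackingBatchSampler


class _ToyDataset(Dataset):  # stand-in for a tokenized EHR dataset
    def __init__(self, seqs):
        self.seqs = seqs

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, i):
        return self.seqs[i]


seqs = [list(range(n)) for n in (120, 80, 400, 950, 60)]
dataset = _ToyDataset(seqs)
lengths = [len(s) for s in seqs]  # per-sample lengths known up front

batch_sampler = SamplePackingBatchSampler(
    lengths=lengths,
    max_tokens_per_batch=1024,    # packing budget per batch (illustrative)
    max_position_embeddings=512,  # context window; longer samples count as truncated
    seed=42,
)
# Each yielded batch is a list of dataset indices whose (truncated) lengths,
# plus two separator tokens each, fit within max_tokens_per_batch.
loader = DataLoader(dataset, batch_sampler=batch_sampler, collate_fn=lambda xs: xs)
for batch in loader:
    print([len(x) for x in batch])
```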
cehrgpt/generation/generate_batch_hf_gpt_sequence.py CHANGED
@@ -93,9 +93,9 @@ def generate_single_batch(
         temperature=temperature,
         top_p=top_p,
         top_k=top_k,
-        bos_token_id=tokenizer.end_token_id,
-        eos_token_id=tokenizer.end_token_id,
-        pad_token_id=tokenizer.pad_token_id,
+        bos_token_id=model.generation_config.bos_token_id,
+        eos_token_id=model.generation_config.eos_token_id,
+        pad_token_id=model.generation_config.pad_token_id,
         do_sample=True,
         use_cache=True,
         return_dict_in_generate=True,
@@ -150,15 +150,11 @@ def main(args):
             attn_implementation=(
                 "flash_attention_2" if is_flash_attn_2_available() else "eager"
             ),
-            torch_dtype=(
-                torch.bfloat16
-                if is_flash_attn_2_available() and args.use_bfloat16
-                else torch.float32
-            ),
         )
         .eval()
         .to(device)
     )
+
     cehrgpt_model.generation_config.pad_token_id = cehrgpt_tokenizer.pad_token_id
     cehrgpt_model.generation_config.eos_token_id = cehrgpt_tokenizer.end_token_id
     cehrgpt_model.generation_config.bos_token_id = cehrgpt_tokenizer.end_token_id
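
Together, these two hunks make the model's `GenerationConfig` the single source of truth for special token ids: `main` sets them once from the tokenizer, and `generate_single_batch` reads them back instead of re-deriving them per call. The removed `torch_dtype` block also means the model now loads in its default dtype regardless of `args.use_bfloat16`. A hedged sketch of how a caller could still opt into bfloat16 at load time, assuming the `CEHRGPT2LMHeadModel` class exported by `cehrgpt.models.hf_cehrgpt` (the class name is an assumption, not confirmed by this diff):

```python
import torch
from transformers.utils import is_flash_attn_2_available

from cehrgpt.models.hf_cehrgpt import CEHRGPT2LMHeadModel  # assumed class name

# Request bfloat16 explicitly when flash attention is available,
# since the script no longer does this on the caller's behalf.
model = CEHRGPT2LMHeadModel.from_pretrained(
    "path/to/checkpoint",  # placeholder path
    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else "eager",
    torch_dtype=torch.bfloat16 if is_flash_attn_2_available() else torch.float32,
)
```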
@@ -192,6 +188,7 @@ def main(args):
     LOG.info(f"Top P {args.top_p}")
     LOG.info(f"Top K {args.top_k}")
     LOG.info(f"Loading demographic_info at {args.demographic_data_path}")
+    LOG.info(f"MEDS format: {args.meds_format}")

     dataset = load_parquet_as_dataset(args.demographic_data_path)
     total_rows = len(dataset)
@@ -199,6 +196,7 @@ def main(args):
     num_of_batches = args.num_of_patients // args.batch_size + 1
     sequence_to_flush = []
     current_person_id = 1
+    prompt_size = 2 if args.meds_format else START_TOKEN_SIZE
     for i in range(num_of_batches):
         LOG.info(f"{datetime.datetime.now()}: Batch {i} started")

@@ -215,7 +213,7 @@ def main(args):
                 <= max_seq_allowed
             ):
                 random_prompts.append(
-                    cehrgpt_tokenizer.encode(row["concept_ids"][:START_TOKEN_SIZE])
+                    cehrgpt_tokenizer.encode(row["concept_ids"][:prompt_size])
                 )
             iter += 1
             if not random_prompts and iter > 10:
@@ -326,6 +324,11 @@ def create_arg_parser():
         dest="drop_long_sequences",
         action="store_true",
     )
+    base_arg_parser.add_argument(
+        "--meds_format",
+        dest="meds_format",
+        action="store_true",
+    )
     return base_arg_parser

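
Downstream, `--meds_format` shrinks the generation prompt from the first `START_TOKEN_SIZE` tokens to the first two. A small illustration (the token strings and the value of `START_TOKEN_SIZE` are assumptions for display, not taken from this diff):

```python
START_TOKEN_SIZE = 4  # assumed value of the constant imported by this script

# Hypothetical demographic-prefixed sequence: year, age, gender, race, then events
concept_ids = ["year:1980", "age:30-40", "Gender/8507", "Race/8527", "Visit/9202"]

for meds_format in (False, True):
    prompt_size = 2 if meds_format else START_TOKEN_SIZE
    print(meds_format, concept_ids[:prompt_size])
# False -> the full four-token demographic prompt
# True  -> only the first two tokens
```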
cehrgpt/generation/omop_converter_batch.py CHANGED
@@ -60,6 +60,24 @@ OOV_CONCEPT_MAP = {
 }


+def extract_gender_concept_id(gender_token: str) -> int:
+    if gender_token.startswith("Gender/"):
+        return int(gender_token[len("Gender/") :])
+    elif gender_token.isnumeric():
+        return int(gender_token)
+    else:
+        return 0
+
+
+def extract_race_concept_id(race_token: str) -> int:
+    if race_token.startswith("Race/"):
+        return int(race_token[len("Race/") :])
+    elif race_token.isnumeric():
+        return int(race_token)
+    else:
+        return 0
+
+
 def create_folder_if_not_exists(output_folder, table_name):
     if not os.path.isdir(Path(output_folder) / table_name):
         os.mkdir(Path(output_folder) / table_name)
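
The two new helpers accept either MEDS-style prefixed tokens or bare OMOP concept ids, and fall back to 0 for anything unparseable. A few illustrative calls (8507, 8532, and 8527 are standard OMOP gender and race concept ids):

```python
from cehrgpt.generation.omop_converter_batch import (
    extract_gender_concept_id,
    extract_race_concept_id,
)

assert extract_gender_concept_id("Gender/8507") == 8507  # MEDS-style MALE token
assert extract_gender_concept_id("8532") == 8532         # bare FEMALE concept id
assert extract_gender_concept_id("MALE") == 0            # unparseable -> 0
assert extract_race_concept_id("Race/8527") == 8527      # MEDS-style White token
assert extract_race_concept_id("UNKNOWN") == 0           # unparseable -> 0
```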
@@ -288,7 +306,13 @@ def gpt_to_omop_converter_batch(
         if int(birth_year) < 1900 or int(birth_year) > datetime.date.today().year:
             continue

-        p = Person(person_id, start_gender, birth_year, start_race)
+        p = Person(
+            person_id=person_id,
+            gender_concept_id=extract_gender_concept_id(start_gender),
+            year_of_birth=birth_year,
+            race_concept_id=extract_race_concept_id(start_race),
+        )
+
         append_to_dict(omop_export_dict, p, person_id)
         id_mappings_dict["person"][person_id] = person_id
         pt_seq_dict[person_id] = " ".join(concept_ids)
@@ -316,7 +340,12 @@ def gpt_to_omop_converter_batch(
                 id_mappings_dict["death"][person_id] = person_id
             else:
                 try:
-                    visit_concept_id = int(clinical_events[event_idx + 1])
+                    if clinical_events[event_idx + 1].startswith("Visit/"):
+                        visit_concept_id = int(
+                            clinical_events[event_idx + 1][len("Visit/") :]
+                        )
+                    else:
+                        visit_concept_id = int(clinical_events[event_idx + 1])
                     inpatient_visit_indicator = visit_concept_id in [
                         9201,
                         262,
@@ -349,6 +378,7 @@ def gpt_to_omop_converter_batch(
                         visit_occurrence_id
                     ] = person_id
                     visit_occurrence_id += 1
+
             elif event in ATT_TIME_TOKENS:
                 if event[0] == "D":
                     att_date_delta = int(event[1:])
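
The visit-parsing change means a MEDS-style `Visit/9201` token and a bare `9201` now resolve to the same concept id before the inpatient check. A small sketch restating that logic outside the converter (`parse_visit_concept_id` is a hypothetical helper for illustration, not part of this release):

```python
def parse_visit_concept_id(token: str) -> int:
    # Strip the optional MEDS-style "Visit/" prefix, then parse the concept id
    if token.startswith("Visit/"):
        return int(token[len("Visit/") :])
    return int(token)


for token in ("Visit/9201", "9201", "Visit/262"):
    visit_concept_id = parse_visit_concept_id(token)
    # 9201 (Inpatient Visit) and 262 (ER and Inpatient Visit) mark inpatient stays
    print(token, visit_concept_id, visit_concept_id in [9201, 262])
```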
cehrgpt/gpt_utils.py CHANGED
@@ -11,6 +11,7 @@ from cehrgpt.models.special_tokens import (
 )

 # Regular expression pattern to match inpatient attendance tokens
+MEDS_CODE_PATTERN = re.compile(r".*/.*")
 INPATIENT_ATT_PATTERN = re.compile(r"(?:VS-|i-)D(\d+)(?:-VE)?")
 DEMOGRAPHIC_PROMPT_SIZE = 4

@@ -194,8 +195,12 @@ def get_cehrgpt_output_folder(args, cehrgpt_tokenizer) -> str:
     return folder_name


-def is_clinical_event(token: str) -> bool:
-    return token.isnumeric()
+def is_clinical_event(token: str, meds: bool = False) -> bool:
+    if token.isnumeric():
+        return True
+    if meds:
+        return bool(MEDS_CODE_PATTERN.match(token))
+    return False


 def is_visit_start(token: str):
@@ -212,6 +217,18 @@ def is_visit_end(token: str) -> bool:
     return token in ["VE", "[VE]"]


+def is_inpatient_hour_token(token: str) -> bool:
+    return token.startswith("i-H")
+
+
+def extract_time_interval_in_hours(token: str) -> int:
+    try:
+        hour = int(token[3:])
+        return hour
+    except ValueError:
+        return 0
+
+
 def is_att_token(token: str):
     """
     Check if the token is an attention token.
@@ -251,6 +268,7 @@ def is_artificial_token(token: str) -> bool:
         return True
     if token == END_TOKEN:
         return True
+
     return False

cehrgpt/models/config.py CHANGED
@@ -121,6 +121,7 @@ class CEHRGPTConfig(PretrainedConfig):
         bos_token_id=50256,
         eos_token_id=50256,
         lab_token_ids=None,
+        ve_token_id=None,
         scale_attn_by_inverse_layer_idx=False,
         reorder_and_upcast_attn=False,
         exclude_position_ids=False,
@@ -128,19 +129,27 @@ class CEHRGPTConfig(PretrainedConfig):
         value_vocab_size=None,
         include_ttv_prediction=False,
         use_sub_time_tokenization=True,
+        include_motor_time_to_event=True,
+        motor_tte_vocab_size=None,
+        motor_time_to_event_weight=1.0,
+        motor_num_time_pieces=16,
         token_to_time_token_mapping: Dict[int, List] = None,
         use_pretrained_embeddings=False,
         n_pretrained_embeddings_layers=2,
         pretrained_embedding_dim=768,
         pretrained_token_ids: List[int] = None,
+        next_token_prediction_loss_weight=1.0,
         time_token_loss_weight=1.0,
         time_to_visit_loss_weight=1.0,
         causal_sfm=False,
         demographics_size=4,
         lab_token_penalty=False,
         lab_token_loss_weight=0.9,
+        value_prediction_loss_weight=1.0,
         entropy_penalty=False,
         entropy_penalty_alpha=0.01,
+        sample_packing_max_positions=None,
+        class_weights=None,
         **kwargs,
     ):
         if token_to_time_token_mapping is None:
@@ -150,6 +159,11 @@ class CEHRGPTConfig(PretrainedConfig):
         self.vocab_size = vocab_size
         self.time_token_vocab_size = time_token_vocab_size
         self.n_positions = n_positions
+        self.sample_packing_max_positions = (
+            sample_packing_max_positions
+            if sample_packing_max_positions
+            else n_positions
+        )
         self.n_embd = n_embd
         self.n_layer = n_layer
         self.n_head = n_head
@@ -178,11 +192,28 @@ class CEHRGPTConfig(PretrainedConfig):
         self.include_values = include_values
         self.value_vocab_size = value_vocab_size

+        self.next_token_prediction_loss_weight = next_token_prediction_loss_weight
         self.include_ttv_prediction = include_ttv_prediction
         self.use_sub_time_tokenization = use_sub_time_tokenization
         self._token_to_time_token_mapping = token_to_time_token_mapping
         self.time_token_loss_weight = time_token_loss_weight
         self.time_to_visit_loss_weight = time_to_visit_loss_weight
+
+        # MOTOR TTE configuration
+        self.motor_tte_vocab_size = motor_tte_vocab_size
+        self.include_motor_time_to_event = (
+            include_motor_time_to_event
+            and self.motor_tte_vocab_size
+            and self.motor_tte_vocab_size > 0
+        )
+        if self.include_motor_time_to_event and not ve_token_id:
+            raise RuntimeError(
+                f"ve_token_id must be provided when include_motor_time_to_event is True"
+            )
+        self.ve_token_id = ve_token_id
+        self.motor_time_to_event_weight = motor_time_to_event_weight
+        self.motor_num_time_pieces = motor_num_time_pieces
+
         self.causal_sfm = causal_sfm
         self.demographics_size = demographics_size
         self.use_pretrained_embeddings = use_pretrained_embeddings
@@ -195,6 +226,10 @@ class CEHRGPTConfig(PretrainedConfig):
         self.lab_token_loss_weight = lab_token_loss_weight
         self.entropy_penalty = entropy_penalty
         self.entropy_penalty_alpha = entropy_penalty_alpha
+        self.value_prediction_loss_weight = value_prediction_loss_weight
+
+        # Class weights for fine-tuning
+        self.class_weights = class_weights

         kwargs["tie_word_embeddings"] = not use_pretrained_embeddings
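
A hedged construction sketch for the new MOTOR time-to-event options: the flag only takes effect when a positive `motor_tte_vocab_size` is supplied, and `ve_token_id` must accompany it. All argument values below are illustrative, and the remaining constructor defaults are assumed to suffice:

```python
from cehrgpt.models.config import CEHRGPTConfig

config = CEHRGPTConfig(
    motor_tte_vocab_size=512,           # illustrative TTE vocabulary size
    ve_token_id=5,                      # illustrative visit-end token id; required here
    motor_time_to_event_weight=1.0,     # loss weight for the TTE objective
    motor_num_time_pieces=16,           # number of time pieces for the TTE head
    sample_packing_max_positions=8192,  # falls back to n_positions when unset
)
assert config.include_motor_time_to_event

# Omitting ve_token_id while MOTOR TTE is enabled raises RuntimeError:
try:
    CEHRGPTConfig(motor_tte_vocab_size=512)
except RuntimeError as e:
    print(e)
```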