cehrgpt 0.0.2__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. cehrgpt/analysis/irregularity.py +36 -0
  2. cehrgpt/data/hf_cehrgpt_dataset.py +25 -4
  3. cehrgpt/data/hf_cehrgpt_dataset_collator.py +635 -97
  4. cehrgpt/data/hf_cehrgpt_dataset_mapping.py +308 -95
  5. cehrgpt/data/sample_packing_sampler.py +181 -0
  6. cehrgpt/generation/generate_batch_hf_gpt_sequence.py +12 -9
  7. cehrgpt/generation/omop_converter_batch.py +32 -2
  8. cehrgpt/gpt_utils.py +20 -2
  9. cehrgpt/models/config.py +35 -0
  10. cehrgpt/models/hf_cehrgpt.py +470 -106
  11. cehrgpt/models/hf_modeling_outputs.py +1 -0
  12. cehrgpt/models/special_tokens.py +1 -0
  13. cehrgpt/models/tokenization_hf_cehrgpt.py +358 -71
  14. cehrgpt/runners/data_utils.py +358 -0
  15. cehrgpt/runners/gpt_runner_util.py +0 -10
  16. cehrgpt/runners/hf_cehrgpt_finetune_runner.py +181 -283
  17. cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +288 -112
  18. cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +90 -0
  19. cehrgpt/runners/hyperparameter_search_util.py +10 -8
  20. cehrgpt/runners/sample_packing_trainer.py +185 -0
  21. cehrgpt/simulations/generate_plots.py +95 -0
  22. cehrgpt/simulations/run_simulation.sh +24 -0
  23. cehrgpt/simulations/time_embedding_simulation.py +250 -0
  24. cehrgpt/simulations/time_token_simulation.py +177 -0
  25. cehrgpt/time_to_event/config/1_year_cabg.yaml +23 -0
  26. cehrgpt/time_to_event/time_to_event_model.py +2 -13
  27. cehrgpt/time_to_event/time_to_event_prediction.py +27 -13
  28. cehrgpt/tools/linear_prob/__init__.py +0 -0
  29. cehrgpt/tools/linear_prob/compute_cehrgpt_features.py +495 -0
  30. cehrgpt/tools/linear_prob/train_with_cehrgpt_features.py +152 -0
  31. {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/METADATA +11 -8
  32. {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/RECORD +36 -32
  33. {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/WHEEL +1 -1
  34. cehrgpt/data/hf_cehrgpt_dpo_collator.py +0 -71
  35. cehrgpt/data/hf_cehrgpt_dpo_dataset_mapping.py +0 -61
  36. cehrgpt/generation/generate_paired_cehrgpt_sequence.py +0 -224
  37. cehrgpt/rl_finetune/cehrgpt_dpo_trainer.py +0 -586
  38. cehrgpt/rl_finetune/cehrgpt_ppo_trainer.py +0 -464
  39. cehrgpt/rl_finetune/ppo_finetune.py +0 -394
  40. cehrgpt/rl_finetune/ppo_finetune_v2.py +0 -373
  41. cehrgpt/runners/hf_cehrgpt_dpo_runner.py +0 -119
  42. /cehrgpt/{rl_finetune → simulations}/__init__.py +0 -0
  43. {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info/licenses}/LICENSE +0 -0
  44. {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/top_level.txt +0 -0
cehrgpt/runners/hf_cehrgpt_pretrain_runner.py
@@ -1,8 +1,12 @@
 import os
+from functools import partial
 from typing import Optional, Union

+import numpy as np
 import torch
+import torch.distributed as dist
 from cehrbert.data_generators.hf_data_generator.meds_utils import (
+    CacheFileCollector,
     create_dataset_from_meds_reader,
 )
 from cehrbert.runners.hf_runner_argument_dataclass import (
@@ -16,22 +20,42 @@ from cehrbert.runners.runner_util import (
     load_parquet_as_dataset,
 )
 from datasets import Dataset, DatasetDict, IterableDatasetDict, load_from_disk
-from transformers import AutoConfig, Trainer, TrainingArguments, set_seed
+from transformers import EarlyStoppingCallback, Trainer, TrainingArguments, set_seed
+from transformers.trainer_utils import is_main_process
 from transformers.utils import is_flash_attn_2_available, logging

 from cehrgpt.data.hf_cehrgpt_dataset import create_cehrgpt_pretraining_dataset
-from cehrgpt.data.hf_cehrgpt_dataset_collator import CehrGptDataCollator
+from cehrgpt.data.hf_cehrgpt_dataset_collator import (
+    CehrGptDataCollator,
+    SamplePackingCehrGptDataCollator,
+)
 from cehrgpt.data.hf_cehrgpt_dataset_mapping import MedToCehrGPTDatasetMapping
 from cehrgpt.models.config import CEHRGPTConfig
 from cehrgpt.models.hf_cehrgpt import CEHRGPT2LMHeadModel
 from cehrgpt.models.pretrained_embeddings import PretrainedEmbeddings
 from cehrgpt.models.tokenization_hf_cehrgpt import CehrGptTokenizer
+from cehrgpt.runners.data_utils import get_torch_dtype
 from cehrgpt.runners.gpt_runner_util import parse_runner_args
 from cehrgpt.runners.hf_gpt_runner_argument_dataclass import CehrGPTArguments
+from cehrgpt.runners.sample_packing_trainer import SamplePackingTrainer

 LOG = logging.get_logger("transformers")


+class CustomEarlyStoppingCallback(EarlyStoppingCallback):
+    def check_metric_value(self, args, state, control, metric_value):
+        # best_metric is set by code for load_best_model
+        operator = np.greater if args.greater_is_better else np.less
+        if state.best_metric is None or (
+            operator(metric_value, state.best_metric)
+            and abs(metric_value - state.best_metric) / state.best_metric
+            > self.early_stopping_threshold
+        ):
+            self.early_stopping_patience_counter = 0
+        else:
+            self.early_stopping_patience_counter += 1
+
+
 def tokenizer_exists(tokenizer_name_or_path: str) -> bool:
     # Try to load the pretrained tokenizer
     try:
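
The new CustomEarlyStoppingCallback swaps the absolute-difference test in transformers.EarlyStoppingCallback for a relative one: patience resets only when the metric beats the best value by more than early_stopping_threshold expressed as a fraction of the current best. A standalone illustration of that check (not part of the diff), assuming greater_is_better=False as for eval_loss:

import numpy as np

def improves(metric_value: float, best_metric: float, threshold: float = 0.01) -> bool:
    # Mirrors the overridden check_metric_value: the metric must beat the best
    # AND move by more than `threshold` as a fraction of the current best.
    operator = np.less  # greater_is_better=False
    return bool(
        operator(metric_value, best_metric)
        and abs(metric_value - best_metric) / best_metric > threshold
    )

print(improves(0.498, 0.500))  # False: a 0.4% relative improvement is not enough
print(improves(0.490, 0.500))  # True: a 2% relative improvement resets patience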
@@ -48,6 +72,36 @@ def load_and_create_tokenizer(
     cehrgpt_args: CehrGPTArguments,
     dataset: Optional[Union[Dataset, DatasetDict]] = None,
 ) -> CehrGptTokenizer:
+
+    concept_name_mapping = {}
+    allowed_motor_codes = list()
+    if cehrgpt_args.concept_dir:
+        import pandas as pd
+        from cehrbert_data.const.artificial_tokens import DEATH_TOKEN
+        from meds.schema import death_code
+
+        LOG.info("Loading concept data from disk at %s", cehrgpt_args.concept_dir)
+        concept_pd = pd.read_parquet(cehrgpt_args.concept_dir)
+        LOG.info(
+            "Creating concept name mapping and motor_time_to_event_codes from disk at %s",
+            cehrgpt_args.concept_dir,
+        )
+        for row in concept_pd.itertuples():
+            concept_name_mapping[str(getattr(row, "concept_id"))] = getattr(
+                row, "concept_name"
+            )
+            if (
+                cehrgpt_args.include_motor_time_to_event
+                and getattr(row, "domain_id")
+                in ["Condition", "Procedure", "Drug", "Visit"]
+                and getattr(row, "standard_concept") == "S"
+            ):
+                allowed_motor_codes.append(str(getattr(row, "concept_id")))
+        LOG.info(
+            "Adding death codes for MOTOR TTE predictions: %s",
+            [DEATH_TOKEN, death_code],
+        )
+        allowed_motor_codes.extend([DEATH_TOKEN, death_code])
     # Try to load the pretrained tokenizer
     tokenizer_abspath = os.path.expanduser(model_args.tokenizer_name_or_path)
     try:
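
The concept table read in this hunk follows the OMOP vocabulary layout: the loop relies on the columns concept_id, concept_name, domain_id, and standard_concept. A minimal sketch of the expected shape and the filtering rule (the concept IDs below are real OMOP standard concepts, but the frame itself is illustrative, not part of the diff):

import pandas as pd

concept_pd = pd.DataFrame(
    {
        "concept_id": [316866, 4329847, 9201],
        "concept_name": ["Hypertensive disorder", "Myocardial infarction", "Inpatient Visit"],
        "domain_id": ["Condition", "Condition", "Visit"],
        "standard_concept": ["S", "S", "S"],
    }
)
# Same rule as above: standard concepts from the four clinical-event domains
allowed_motor_codes = [
    str(row.concept_id)
    for row in concept_pd.itertuples()
    if row.domain_id in ["Condition", "Procedure", "Drug", "Visit"]
    and row.standard_concept == "S"
]
print(allowed_motor_codes)  # ['316866', '4329847', '9201']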
@@ -59,13 +113,24 @@ def load_and_create_tokenizer(
                 f"Failed to load the tokenizer from {tokenizer_abspath} with the error \n{e}\n"
                 f"Tried to create the tokenizer, however the dataset is not provided."
             )
+        LOG.info("Started training the tokenizer ...")
         tokenizer = CehrGptTokenizer.train_tokenizer(
             dataset,
-            {},
+            concept_name_mapping,
             data_args,
             PretrainedEmbeddings(cehrgpt_args.pretrained_embedding_path),
+            allowed_motor_codes if cehrgpt_args.include_motor_time_to_event else None,
+            (
+                cehrgpt_args.num_motor_tasks
+                if cehrgpt_args.include_motor_time_to_event
+                else None
+            ),
+            apply_entropy_filter=cehrgpt_args.apply_entropy_filter,
+            min_prevalence=cehrgpt_args.min_prevalence,
         )
+        LOG.info("Finished training the tokenizer ...")
         tokenizer.save_pretrained(tokenizer_abspath)
+        LOG.info("Saved the tokenizer to %s", tokenizer_abspath)

     return tokenizer

@@ -73,13 +138,12 @@ def load_and_create_tokenizer(
 def load_and_create_model(
     model_args: ModelArguments,
     cehrgpt_args: CehrGPTArguments,
-    training_args: TrainingArguments,
     tokenizer: CehrGptTokenizer,
 ) -> CEHRGPT2LMHeadModel:
     attn_implementation = (
         "flash_attention_2" if is_flash_attn_2_available() else "eager"
     )
-    torch_dtype = torch.bfloat16 if training_args.bf16 else torch.float32
+    torch_dtype = get_torch_dtype(model_args.torch_dtype)
     model_abspath = os.path.expanduser(model_args.model_name_or_path)
     if cehrgpt_args.continue_pretrain:
         try:
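
load_and_create_model now derives the dtype from model_args.torch_dtype via get_torch_dtype instead of inferring it from the bf16 training flag, which is why training_args drops out of the signature. The helper's body lives in cehrgpt/runners/data_utils.py and is not shown in this diff; a minimal sketch consistent with the call site might be:

from typing import Optional

import torch

def get_torch_dtype(torch_dtype: Optional[str]) -> torch.dtype:
    # Map a string such as "bfloat16", "float16", or "float32" to the
    # corresponding torch dtype, falling back to float32 when unset.
    if torch_dtype is not None and hasattr(torch, torch_dtype):
        dtype = getattr(torch, torch_dtype)
        if isinstance(dtype, torch.dtype):
            return dtype
    return torch.float32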
@@ -120,6 +184,9 @@ def load_and_create_model(
         pretrained_embedding_dim = tokenizer.pretrained_embeddings.shape[1]
     else:
         pretrained_embedding_dim = model_args.hidden_size
+
+    model_args_cehrgpt = model_args.as_dict()
+    model_args_cehrgpt.pop("attn_implementation")
     model_config = CEHRGPTConfig(
         vocab_size=tokenizer.vocab_size,
         value_vocab_size=tokenizer.value_vocab_size,
@@ -131,15 +198,28 @@ def load_and_create_model(
         attn_implementation=attn_implementation,
         causal_sfm=cehrgpt_args.causal_sfm,
         demographics_size=cehrgpt_args.demographics_size,
+        next_token_prediction_loss_weight=cehrgpt_args.next_token_prediction_loss_weight,
         lab_token_penalty=cehrgpt_args.lab_token_penalty,
         lab_token_loss_weight=cehrgpt_args.lab_token_loss_weight,
+        value_prediction_loss_weight=cehrgpt_args.value_prediction_loss_weight,
         entropy_penalty=cehrgpt_args.entropy_penalty,
         entropy_penalty_alpha=cehrgpt_args.entropy_penalty_alpha,
         n_pretrained_embeddings_layers=cehrgpt_args.n_pretrained_embeddings_layers,
         use_pretrained_embeddings=len(tokenizer.pretrained_token_ids) > 0,
         pretrained_embedding_dim=pretrained_embedding_dim,
-        **model_args.as_dict(),
+        sample_packing_max_positions=(
+            cehrgpt_args.max_tokens_per_batch
+            if cehrgpt_args.sample_packing
+            else model_args.max_position_embeddings
+        ),
+        include_motor_time_to_event=cehrgpt_args.include_motor_time_to_event,
+        motor_tte_vocab_size=tokenizer.motor_tte_vocab_size,
+        motor_time_to_event_weight=cehrgpt_args.motor_time_to_event_weight,
+        motor_num_time_pieces=cehrgpt_args.motor_num_time_pieces,
+        ve_token_id=tokenizer.ve_token_id,
+        **model_args_cehrgpt,
     )
+
     model = CEHRGPT2LMHeadModel(model_config)
     if tokenizer.pretrained_token_ids:
         model.cehrgpt.update_pretrained_embeddings(
@@ -156,6 +236,11 @@
 def main():
     cehrgpt_args, data_args, model_args, training_args = parse_runner_args()

+    if cehrgpt_args.sample_packing and data_args.streaming:
+        raise RuntimeError(
+            f"sample_packing is not supported when streaming is enabled, please set streaming to False"
+        )
+
     if data_args.streaming:
         # This is for disabling the warning message https://github.com/huggingface/transformers/issues/5486
         # This happens only when streaming is enabled
@@ -165,6 +250,8 @@ def main():
         training_args.dataloader_num_workers = 0
         training_args.dataloader_prefetch_factor = None

+    processed_dataset: Optional[DatasetDict] = None
+    cache_file_collector = CacheFileCollector()
     prepared_ds_path = generate_prepared_ds_path(data_args, model_args)
     if os.path.exists(os.path.join(data_args.data_folder, "dataset_dict.json")):
         LOG.info(f"Loading prepared dataset from disk at {data_args.data_folder}...")
@@ -200,118 +287,160 @@ def main():
         )
         cehrgpt_tokenizer = CehrGptTokenizer.from_pretrained(tokenizer_name_or_path)
     else:
-        # If the data is in the MEDS format, we need to convert it to the CEHR-BERT format
-        if data_args.is_data_in_meds:
-            meds_extension_path = get_meds_extension_path(
-                data_folder=data_args.data_folder,
-                dataset_prepared_path=data_args.dataset_prepared_path,
-            )
-            try:
-                LOG.info(
-                    "Trying to load the MEDS extension from disk at %s...",
-                    meds_extension_path,
+        # Only run tokenization and data transformation in the main process in torch distributed training
+        # otherwise the multiple processes will create tokenizers at the same time
+        if is_main_process(training_args.local_rank):
+            # If the data is in the MEDS format, we need to convert it to the CEHR-BERT format
+            if data_args.is_data_in_meds:
+                meds_extension_path = get_meds_extension_path(
+                    data_folder=data_args.data_folder,
+                    dataset_prepared_path=data_args.dataset_prepared_path,
                 )
-            dataset = load_from_disk(meds_extension_path)
-            if data_args.streaming:
-                if isinstance(dataset, DatasetDict):
-                    dataset = {
-                        k: v.to_iterable_dataset(
+                try:
+                    LOG.info(
+                        "Trying to load the MEDS extension from disk at %s...",
+                        meds_extension_path,
+                    )
+                    dataset = load_from_disk(meds_extension_path)
+                    if data_args.streaming:
+                        if isinstance(dataset, DatasetDict):
+                            dataset = {
+                                k: v.to_iterable_dataset(
                                    num_shards=training_args.dataloader_num_workers
                                )
-                        for k, v in dataset.items()
-                    }
-                else:
-                    dataset = dataset.to_iterable_dataset(
-                        num_shards=training_args.dataloader_num_workers
-                    )
-            except FileNotFoundError as e:
-                LOG.exception(e)
-                dataset = create_dataset_from_meds_reader(
-                    data_args=data_args,
-                    dataset_mappings=[
-                        MedToCehrGPTDatasetMapping(
-                            data_args=data_args,
-                            is_pretraining=True,
-                            include_inpatient_hour_token=cehrgpt_args.include_inpatient_hour_token,
+                                for k, v in dataset.items()
+                            }
+                        else:
+                            dataset = dataset.to_iterable_dataset(
+                                num_shards=training_args.dataloader_num_workers
+                            )
+                except FileNotFoundError as e:
+                    LOG.warning(e)
+                    dataset = create_dataset_from_meds_reader(
+                        data_args=data_args,
+                        dataset_mappings=[
+                            MedToCehrGPTDatasetMapping(
+                                data_args=data_args,
+                                include_inpatient_hour_token=cehrgpt_args.include_inpatient_hour_token,
+                            )
+                        ],
+                        cache_file_collector=cache_file_collector,
+                    )
+                    if not data_args.streaming:
+                        dataset.save_to_disk(str(meds_extension_path))
+                        stats = dataset.cleanup_cache_files()
+                        LOG.info(
+                            "Clean up the cached files for the cehrgpt dataset transformed from the MEDS: %s",
+                            stats,
                        )
-                ],
+                        # Clean up the files created from the data generator
+                        cache_file_collector.remove_cache_files()
+                        dataset = load_from_disk(str(meds_extension_path))
+            else:
+                # Load the dataset from the parquet files
+                dataset = load_parquet_as_dataset(
+                    os.path.expanduser(data_args.data_folder),
+                    split="train",
+                    streaming=data_args.streaming,
                )
-            if not data_args.streaming:
-                dataset.save_to_disk(str(meds_extension_path))
-                stats = dataset.cleanup_cache_files()
-                LOG.info(
-                    "Clean up the cached files for the cehrgpt dataset transformed from the MEDS: %s",
-                    stats,
+            # If streaming is enabled, we need to manually split the data into train/val
+            if data_args.streaming and data_args.validation_split_num:
+                dataset = dataset.shuffle(
+                    buffer_size=10_000, seed=training_args.seed
                )
-                dataset = load_from_disk(str(meds_extension_path))
-        else:
-            # Load the dataset from the parquet files
-            dataset = load_parquet_as_dataset(
-                os.path.expanduser(data_args.data_folder),
-                split="train",
-                streaming=data_args.streaming,
+                train_set = dataset.skip(data_args.validation_split_num)
+                val_set = dataset.take(data_args.validation_split_num)
+                dataset = DatasetDict({"train": train_set, "validation": val_set})
+            elif data_args.validation_split_percentage:
+                dataset = dataset.train_test_split(
+                    test_size=data_args.validation_split_percentage,
+                    seed=training_args.seed,
+                )
+                dataset = DatasetDict(
+                    {"train": dataset["train"], "validation": dataset["test"]}
+                )
+            else:
+                raise RuntimeError(
+                    f"Can not split the data. If streaming is enabled, validation_split_num needs to be "
+                    f"defined, otherwise validation_split_percentage needs to be provided. "
+                    f"The current values are:\n"
+                    f"validation_split_percentage: {data_args.validation_split_percentage}\n"
+                    f"validation_split_num: {data_args.validation_split_num}\n"
+                    f"streaming: {data_args.streaming}"
+                )
+
+            # Create the CEHR-GPT tokenizer if it's not available in the output folder
+            cehrgpt_tokenizer = load_and_create_tokenizer(
+                data_args=data_args,
+                model_args=model_args,
+                cehrgpt_args=cehrgpt_args,
+                dataset=dataset,
            )
-        # If streaming is enabled, we need to manually split the data into train/val
-        if data_args.streaming and data_args.validation_split_num:
-            dataset = dataset.shuffle(buffer_size=10_000, seed=training_args.seed)
-            train_set = dataset.skip(data_args.validation_split_num)
-            val_set = dataset.take(data_args.validation_split_num)
-            dataset = DatasetDict({"train": train_set, "test": val_set})
-        elif data_args.validation_split_percentage:
-            dataset = dataset.train_test_split(
-                test_size=data_args.validation_split_percentage,
-                seed=training_args.seed,
-            )
-        else:
-            raise RuntimeError(
-                f"Can not split the data. If streaming is enabled, validation_split_num needs to be "
-                f"defined, otherwise validation_split_percentage needs to be provided. "
-                f"The current values are:\n"
-                f"validation_split_percentage: {data_args.validation_split_percentage}\n"
-                f"validation_split_num: {data_args.validation_split_num}\n"
-                f"streaming: {data_args.streaming}"
-            )

-        # Create the CEHR-GPT tokenizer if it's not available in the output folder
-        cehrgpt_tokenizer = load_and_create_tokenizer(
-            data_args=data_args,
-            model_args=model_args,
-            cehrgpt_args=cehrgpt_args,
-            dataset=dataset,
-        )
-        # Retrain the tokenizer in case we want to pretrain the model further using different datasets
-        if cehrgpt_args.expand_tokenizer:
-            new_tokenizer_path = os.path.expanduser(training_args.output_dir)
-            try:
-                cehrgpt_tokenizer = CehrGptTokenizer.from_pretrained(new_tokenizer_path)
-            except Exception:
-                cehrgpt_tokenizer = CehrGptTokenizer.expand_trained_tokenizer(
-                    cehrgpt_tokenizer=cehrgpt_tokenizer,
-                    dataset=dataset["train"],
-                    data_args=data_args,
-                    concept_name_mapping={},
-                    pretrained_concept_embedding_model=PretrainedEmbeddings(
-                        cehrgpt_args.pretrained_embedding_path
-                    ),
-                )
-            cehrgpt_tokenizer.save_pretrained(
-                os.path.expanduser(training_args.output_dir)
+            # Retrain the tokenizer in case we want to pretrain the model further using different datasets
+            if cehrgpt_args.expand_tokenizer:
+                new_tokenizer_path = os.path.expanduser(training_args.output_dir)
+                try:
+                    cehrgpt_tokenizer = CehrGptTokenizer.from_pretrained(
+                        new_tokenizer_path
+                    )
+                except Exception:
+                    cehrgpt_tokenizer = CehrGptTokenizer.expand_trained_tokenizer(
+                        cehrgpt_tokenizer=cehrgpt_tokenizer,
+                        dataset=dataset["train"],
+                        data_args=data_args,
+                        concept_name_mapping={},
+                        pretrained_concept_embedding_model=PretrainedEmbeddings(
+                            cehrgpt_args.pretrained_embedding_path
+                        ),
+                        apply_entropy_filter=cehrgpt_args.apply_entropy_filter,
+                        min_prevalence=cehrgpt_args.min_prevalence,
+                    )
+                cehrgpt_tokenizer.save_pretrained(
+                    os.path.expanduser(training_args.output_dir)
+                )
+
+            # TODO: temp solution, this column is mixed typed and causes an issue when transforming the data
+            if not data_args.streaming:
+                all_columns = dataset["train"].column_names
+                if "visit_concept_ids" in all_columns:
+                    dataset = dataset.remove_columns(["visit_concept_ids"])
+
+            # sort the patient features chronologically and tokenize the data
+            processed_dataset = create_cehrgpt_pretraining_dataset(
+                dataset=dataset,
+                cehrgpt_tokenizer=cehrgpt_tokenizer,
+                data_args=data_args,
+                cache_file_collector=cache_file_collector,
+            )
+            # only save the data to the disk if it is not streaming
+            if not data_args.streaming:
+                processed_dataset.save_to_disk(str(prepared_ds_path))
+                stats = processed_dataset.cleanup_cache_files()
+                LOG.info(
+                    "Clean up the cached files for the cehrgpt pretraining dataset: %s",
+                    stats,
                )
+            cache_file_collector.remove_cache_files()
+
+        # After main-process-only operations, synchronize all processes to ensure consistency
+        if dist.is_available() and dist.is_initialized():
+            dist.barrier()

-    # sort the patient features chronologically and tokenize the data
-    processed_dataset = create_cehrgpt_pretraining_dataset(
-        dataset=dataset, cehrgpt_tokenizer=cehrgpt_tokenizer, data_args=data_args
+        # Loading tokenizer in all processes in torch distributed training
+        tokenizer_name_or_path = os.path.expanduser(
+            training_args.output_dir
+            if cehrgpt_args.expand_tokenizer
+            else model_args.tokenizer_name_or_path
        )
-    # only save the data to the disk if it is not streaming
+        cehrgpt_tokenizer = CehrGptTokenizer.from_pretrained(tokenizer_name_or_path)
+        # Load the dataset from disk again in torch distributed training
        if not data_args.streaming:
-        processed_dataset.save_to_disk(str(prepared_ds_path))
-        stats = processed_dataset.cleanup_cache_files()
-        LOG.info(
-            "Clean up the cached files for the cehrgpt pretraining dataset: %s",
-            stats,
-        )
            processed_dataset = load_from_disk(str(prepared_ds_path))

+    if processed_dataset is None:
+        raise RuntimeError("The processed dataset cannot be None")
+
     def filter_func(examples):
         if cehrgpt_args.drop_long_sequences:
             return [
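
This restructure is the usual rank-0-prepares pattern for torch distributed training: only the main process trains the tokenizer and materializes the tokenized dataset, every rank then meets at dist.barrier(), and afterwards all ranks (including rank 0) reload the same artifacts from disk. A generic sketch of the pattern, with illustrative function names not taken from the diff:

import torch.distributed as dist
from transformers.trainer_utils import is_main_process

def prepare_once_then_load(local_rank, build_fn, load_fn):
    # build_fn: e.g. train the tokenizer and save the tokenized dataset
    # load_fn: read those artifacts back from disk
    if is_main_process(local_rank):
        build_fn()
    # Every rank waits here, so none reads a half-written artifact
    if dist.is_available() and dist.is_initialized():
        dist.barrier()
    return load_fn()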
@@ -339,9 +468,11 @@ def main():
     else:
         processed_dataset = processed_dataset.filter(filter_func, **filter_args)

-    model = load_and_create_model(
-        model_args, cehrgpt_args, training_args, cehrgpt_tokenizer
-    )
+    model = load_and_create_model(model_args, cehrgpt_args, cehrgpt_tokenizer)
+
+    # Try to update motor tte vocab size if the new configuration is different from the existing one
+    if cehrgpt_args.include_motor_time_to_event:
+        model.update_motor_tte_vocab_size(cehrgpt_tokenizer.motor_tte_vocab_size)

     # Expand tokenizer to adapt to the new pretraining dataset
     if model.config.vocab_size < cehrgpt_tokenizer.vocab_size:
@@ -369,22 +500,67 @@ def main():
     # Set seed before initializing model.
     set_seed(training_args.seed)

-    if not data_args.streaming:
+    if not data_args.streaming and not cehrgpt_args.sample_packing:
         processed_dataset.set_format("pt")

-    trainer = Trainer(
+    callbacks = []
+    if cehrgpt_args.use_early_stopping:
+        callbacks.append(
+            CustomEarlyStoppingCallback(
+                model_args.early_stopping_patience,
+                cehrgpt_args.early_stopping_threshold,
+            )
+        )
+
+    if cehrgpt_args.sample_packing:
+        trainer_class = partial(
+            SamplePackingTrainer,
+            max_tokens_per_batch=cehrgpt_args.max_tokens_per_batch,
+            max_position_embeddings=model_args.max_position_embeddings,
+            train_lengths=processed_dataset["train"]["num_of_concepts"],
+            validation_lengths=(
+                processed_dataset["validation"]
+                if "validation" in processed_dataset
+                else processed_dataset["test"]
+            )["num_of_concepts"],
+        )
+        training_args.per_device_train_batch_size = 1
+        training_args.per_device_eval_batch_size = 1
+        data_collator_fn = partial(
+            SamplePackingCehrGptDataCollator,
+            cehrgpt_args.max_tokens_per_batch,
+            model_args.max_position_embeddings,
+            add_end_token_in_sample_packing=cehrgpt_args.add_end_token_in_sample_packing,
+        )
+    else:
+        trainer_class = Trainer
+        data_collator_fn = CehrGptDataCollator
+
+    trainer = trainer_class(
         model=model,
-        data_collator=CehrGptDataCollator(
+        data_collator=data_collator_fn(
             tokenizer=cehrgpt_tokenizer,
-            max_length=model_args.max_position_embeddings,
+            max_length=(
+                cehrgpt_args.max_tokens_per_batch
+                if cehrgpt_args.sample_packing
+                else model_args.max_position_embeddings
+            ),
             shuffle_records=data_args.shuffle_records,
             include_ttv_prediction=model_args.include_ttv_prediction,
             use_sub_time_tokenization=model_args.use_sub_time_tokenization,
             include_values=model_args.include_values,
+            include_motor_time_to_event=cehrgpt_args.include_motor_time_to_event,
+            motor_tte_vocab_size=model.config.motor_tte_vocab_size,
+            motor_num_time_pieces=cehrgpt_args.motor_num_time_pieces,
         ),
         train_dataset=processed_dataset["train"],
-        eval_dataset=processed_dataset["test"],
+        eval_dataset=(
+            processed_dataset["validation"]
+            if "validation" in processed_dataset
+            else processed_dataset["test"]
+        ),
         args=training_args,
+        callbacks=callbacks,
     )

     checkpoint = None
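
The functools.partial construction lets both branches expose one constructor interface: packing-specific arguments are bound early, so the trainer_class(...) and data_collator_fn(...) call sites stay identical whether or not sample packing is on. A minimal illustration of the pattern (class and argument names are illustrative, not from the diff):

from functools import partial

class Collator:
    def __init__(self, tokenizer, max_length, packed=False):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.packed = packed

# Packing branch: bind the extra argument ahead of time
data_collator_fn = partial(Collator, packed=True)
# Non-packing branch would simply be: data_collator_fn = Collator

# The shared call site no longer cares which branch was taken
collator = data_collator_fn(tokenizer="tok", max_length=16384)
print(collator.packed, collator.max_length)  # True 16384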
cehrgpt/runners/hf_gpt_runner_argument_dataclass.py
@@ -6,6 +6,12 @@ from typing import List, Optional
 class CehrGPTArguments:
     """Arguments pertaining to what data we are going to input our model for training and eval."""

+    tokenized_full_dataset_path: Optional[str] = dataclasses.field(
+        default=None,
+        metadata={
+            "help": "The path to the tokenized dataset created for the full population"
+        },
+    )
     include_inpatient_hour_token: Optional[bool] = dataclasses.field(
         default=True,
         metadata={"help": "Include inpatient hour token"},
@@ -115,6 +121,9 @@ class CehrGPTArguments:
             "help": "The lower bound of the learning rate range for hyperparameter tuning."
         },
     )
+    next_token_prediction_loss_weight: float = dataclasses.field(
+        default=1.0, metadata={"help": "The weight of the next token prediction loss"}
+    )
     lab_token_penalty: Optional[bool] = dataclasses.field(
         default=False,
         metadata={
@@ -125,6 +134,10 @@ class CehrGPTArguments:
         default=1.0,
         metadata={"help": "lab_token_loss_weight penalty co-efficient"},
     )
+    value_prediction_loss_weight: Optional[float] = dataclasses.field(
+        default=1.0,
+        metadata={"help": "The weight of the value prediction loss"},
+    )
     entropy_penalty: Optional[bool] = dataclasses.field(
         default=False,
         metadata={"help": "A flag to indicate whether we want to use entropy penalty."},
@@ -139,3 +152,80 @@ class CehrGPTArguments:
             "help": "The number of feed forward layers for transforming pretrained embeddings to internal embeddings"
         },
     )
+    meds_repartition: Optional[bool] = dataclasses.field(
+        default=False,
+        metadata={
+            "help": "A flag to indicate whether we want to repartition the meds train tune sets"
+        },
+    )
+    use_early_stopping: Optional[bool] = dataclasses.field(
+        default=True,
+        metadata={"help": "A flag to indicate whether we want to use early stopping."},
+    )
+    early_stopping_threshold: Optional[float] = dataclasses.field(
+        default=0.01,
+        metadata={
+            "help": "A threshold to denote how much the specified metric must improve to satisfy early stopping conditions."
+        },
+    )
+    sample_packing: Optional[bool] = dataclasses.field(
+        default=False,
+        metadata={
+            "help": "A flag to indicate whether we want to use sample packing for efficient training."
+        },
+    )
+    max_tokens_per_batch: int = dataclasses.field(
+        default=16384, metadata={"help": "Maximum number of tokens in each batch"}
+    )
+    add_end_token_in_sample_packing: Optional[bool] = dataclasses.field(
+        default=False,
+        metadata={
+            "help": "A flag to indicate whether we want to add end token in sample packing"
+        },
+    )
+    include_motor_time_to_event: Optional[bool] = dataclasses.field(
+        default=False,
+        metadata={
+            "help": "A flag to indicate whether we want to include the motor time to events"
+        },
+    )
+    num_motor_tasks: Optional[int] = dataclasses.field(
+        default=10000,
+        metadata={"help": "The number of max MOTOR tasks"},
+    )
+    motor_time_to_event_weight: Optional[float] = dataclasses.field(
+        default=1.0,
+        metadata={"help": "The MOTOR time to event loss weight"},
+    )
+    motor_num_time_pieces: Optional[int] = dataclasses.field(
+        default=8,
+        metadata={
+            "help": "The number of times each motor_num_time_pieces piece has to be"
+        },
+    )
+    concept_dir: Optional[str] = dataclasses.field(
+        default=None,
+        metadata={"help": "The directory where the concept data is stored."},
+    )
+    average_over_sequence: bool = dataclasses.field(
+        default=False,
+        metadata={"help": "Whether or not to average tokens per sequence"},
+    )
+    apply_entropy_filter: Optional[bool] = dataclasses.field(
+        default=False,
+        metadata={"help": "A flag to indicate whether we want to use entropy filter."},
+    )
+    min_prevalence: Optional[float] = dataclasses.field(
+        default=1 / 1000,
+        metadata={"help": "The min_prevalence to keep the concepts in the tokenizer"},
+    )
+    class_weights: Optional[List[int]] = dataclasses.field(
+        default=None,
+        metadata={"help": "The class weights for training"},
+    )
+    negative_sampling_probability: Optional[float] = dataclasses.field(
+        default=None,
+        metadata={
+            "help": "The probability of negative samples will be included in the training data"
+        },
+    )