cehrgpt 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in the supported public registries. It is provided for informational purposes only.
- cehrgpt/analysis/irregularity.py +36 -0
- cehrgpt/data/hf_cehrgpt_dataset.py +1 -0
- cehrgpt/data/hf_cehrgpt_dataset_collator.py +454 -68
- cehrgpt/data/hf_cehrgpt_dataset_mapping.py +232 -17
- cehrgpt/data/sample_packing_sampler.py +36 -6
- cehrgpt/generation/cehrgpt_conditional_generation.py +314 -0
- cehrgpt/generation/generate_batch_hf_gpt_sequence.py +15 -3
- cehrgpt/generation/omop_converter_batch.py +32 -2
- cehrgpt/gpt_utils.py +20 -2
- cehrgpt/models/config.py +25 -0
- cehrgpt/models/hf_cehrgpt.py +244 -39
- cehrgpt/models/hf_modeling_outputs.py +1 -0
- cehrgpt/models/special_tokens.py +1 -0
- cehrgpt/models/tokenization_hf_cehrgpt.py +354 -71
- cehrgpt/runners/data_utils.py +131 -5
- cehrgpt/runners/hf_cehrgpt_finetune_runner.py +84 -51
- cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +59 -7
- cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +60 -0
- cehrgpt/runners/hyperparameter_search_util.py +6 -7
- cehrgpt/runners/sample_packing_trainer.py +17 -0
- cehrgpt/time_to_event/config/1_year_cabg.yaml +23 -0
- cehrgpt/time_to_event/time_to_event_model.py +2 -13
- cehrgpt/time_to_event/time_to_event_prediction.py +27 -13
- cehrgpt/tools/linear_prob/compute_cehrgpt_features.py +80 -62
- {cehrgpt-0.1.0.dist-info → cehrgpt-0.1.2.dist-info}/METADATA +102 -7
- {cehrgpt-0.1.0.dist-info → cehrgpt-0.1.2.dist-info}/RECORD +29 -26
- {cehrgpt-0.1.0.dist-info → cehrgpt-0.1.2.dist-info}/WHEEL +1 -1
- {cehrgpt-0.1.0.dist-info → cehrgpt-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {cehrgpt-0.1.0.dist-info → cehrgpt-0.1.2.dist-info}/top_level.txt +0 -0
cehrgpt/generation/cehrgpt_conditional_generation.py
ADDED
@@ -0,0 +1,314 @@
+import datetime
+import os
+import random
+import shutil
+from pathlib import Path
+from typing import Any, Dict
+
+import numpy as np
+import polars as pl
+import torch
+import torch.distributed as dist
+from cehrbert.runners.runner_util import generate_prepared_ds_path
+from datasets import load_from_disk
+from meds import held_out_split, train_split, tuning_split
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers.trainer_utils import is_main_process
+from transformers.utils import is_flash_attn_2_available, logging
+
+from cehrgpt.data.hf_cehrgpt_dataset import create_cehrgpt_finetuning_dataset
+from cehrgpt.data.hf_cehrgpt_dataset_collator import CehrGptDataCollator
+from cehrgpt.generation.generate_batch_hf_gpt_sequence import (
+    generate_single_batch,
+    normalize_value,
+)
+from cehrgpt.gpt_utils import (
+    extract_time_interval_in_days,
+    extract_time_interval_in_hours,
+    is_att_token,
+    is_inpatient_hour_token,
+    is_visit_end,
+    is_visit_start,
+)
+from cehrgpt.models.hf_cehrgpt import CEHRGPT2LMHeadModel
+from cehrgpt.models.tokenization_hf_cehrgpt import CehrGptTokenizer
+from cehrgpt.runners.data_utils import (
+    extract_cohort_sequences,
+    prepare_finetune_dataset,
+)
+from cehrgpt.runners.gpt_runner_util import parse_runner_args
+from cehrgpt.runners.hf_cehrgpt_pretrain_runner import tokenizer_exists
+
+LOG = logging.get_logger("transformers")
+
+
+def map_data_split_name(split: str) -> str:
+    if split == "train":
+        return train_split
+    elif split == "validation":
+        return tuning_split
+    elif split == "test":
+        return held_out_split
+    raise ValueError(f"Unknown split: {split}")
+
+
+def seed_all(seed: int = 42):
+    """Set seed for Python, NumPy, and PyTorch (CPU & CUDA)."""
+    random.seed(seed)  # Python random
+    np.random.seed(seed)  # NumPy
+    torch.manual_seed(seed)  # PyTorch CPU
+    torch.cuda.manual_seed(seed)  # Current GPU
+    torch.cuda.manual_seed_all(seed)  # All GPUs
+
+    # For reproducibility in dataloader workers
+    os.environ["PYTHONHASHSEED"] = str(seed)
+
+
+def generate_trajectories_per_batch(
+    batch: Dict[str, Any],
+    cehrgpt_tokenizer: CehrGptTokenizer,
+    cehrgpt_model: CEHRGPT2LMHeadModel,
+    device,
+    data_output_path: Path,
+    max_length: int,
+):
+    subject_ids = batch["person_id"].squeeze().detach().cpu().tolist()
+    prediction_times = batch["index_date"].squeeze().detach().cpu().tolist()
+    batched_epoch_times = batch["epoch_times"].detach().cpu().tolist()
+    batched_input_ids = batch["input_ids"]
+    batched_value_indicators = batch["value_indicators"]
+    batched_values = batch["values"]
+    # Make sure the batch does not exceed batch_size
+    batch_sequences = generate_single_batch(
+        cehrgpt_model,
+        cehrgpt_tokenizer,
+        batched_input_ids,
+        values=batched_values,
+        value_indicators=batched_value_indicators,
+        max_length=max_length,
+        top_p=1.0,
+        top_k=cehrgpt_tokenizer.vocab_size,
+        device=device,
+    )
+    # Clear the cache
+    torch.cuda.empty_cache()
+
+    trajectories = []
+    for sample_i, (concept_ids, value_indicators, values) in enumerate(
+        zip(
+            batch_sequences["sequences"],
+            batch_sequences["value_indicators"],
+            batch_sequences["values"],
+        )
+    ):
+        (
+            concept_ids,
+            is_numeric_types,
+            number_as_values,
+            concept_as_values,
+            units,
+        ) = normalize_value(concept_ids, values, cehrgpt_tokenizer)
+
+        epoch_times = batched_epoch_times[sample_i]
+        input_length = len(epoch_times)
+        # Getting the last observed event time from the token before the prediction time
+        window_last_observed = epoch_times[input_length - 1]
+        current_cursor = epoch_times[-1]
+        generated_epoch_times = []
+        valid_indices = []
+
+        for i in range(input_length, len(concept_ids)):
+            concept_id = concept_ids[i]
+            # We use the left padding strategy in the data collator
+            if concept_id in [cehrgpt_tokenizer.pad_token, cehrgpt_tokenizer.end_token]:
+                continue
+            # We need to construct the time stamp
+            if is_att_token(concept_id):
+                current_cursor += extract_time_interval_in_days(concept_id) * 24 * 3600
+            elif is_inpatient_hour_token(concept_id):
+                current_cursor += extract_time_interval_in_hours(concept_id) * 3600
+            elif is_visit_start(concept_id) or is_visit_end(concept_id):
+                continue
+            else:
+                valid_indices.append(i)
+                generated_epoch_times.append(
+                    datetime.datetime.utcfromtimestamp(current_cursor).replace(
+                        tzinfo=None
+                    )
+                )
+
+        trajectories.append(
+            {
+                "subject_id": subject_ids[sample_i],
+                "prediction_time": datetime.datetime.utcfromtimestamp(
+                    prediction_times[sample_i]
+                ).replace(tzinfo=None),
+                "window_last_observed_time": datetime.datetime.utcfromtimestamp(
+                    window_last_observed
+                ).replace(tzinfo=None),
+                "times": generated_epoch_times,
+                "concept_ids": np.asarray(concept_ids)[valid_indices].tolist(),
+                "numeric_values": np.asarray(number_as_values)[valid_indices].tolist(),
+                "text_value": np.asarray(concept_as_values)[valid_indices].tolist(),
+                "units": np.asarray(units)[valid_indices].tolist(),
+            }
+        )
+
+    trajectories = (
+        pl.DataFrame(trajectories)
+        .explode(["times", "concept_ids", "numeric_values", "text_value", "units"])
+        .rename(
+            {
+                "times": "time",
+                "concept_ids": "code",
+                "numeric_values": "numeric_value",
+                "units": "unit",
+            }
+        )
+        .select(
+            "subject_id",
+            "prediction_time",
+            "window_last_observed_time",
+            "time",
+            "code",
+            "numeric_value",
+            "text_value",
+            "unit",
+        )
+    )
+    trajectories.write_parquet(data_output_path)
+
+
+def main():
+    cehrgpt_args, data_args, model_args, training_args = parse_runner_args()
+    if torch.cuda.is_available():
+        device = torch.device("cuda")
+    else:
+        device = torch.device("cpu")
+    cehrgpt_tokenizer = CehrGptTokenizer.from_pretrained(
+        model_args.tokenizer_name_or_path
+    )
+    cehrgpt_model = (
+        CEHRGPT2LMHeadModel.from_pretrained(
+            model_args.model_name_or_path,
+            attn_implementation=(
+                "flash_attention_2" if is_flash_attn_2_available() else "eager"
+            ),
+        )
+        .eval()
+        .to(device)
+    )
+    cehrgpt_model.generation_config.pad_token_id = cehrgpt_tokenizer.pad_token_id
+    cehrgpt_model.generation_config.eos_token_id = cehrgpt_tokenizer.end_token_id
+    cehrgpt_model.generation_config.bos_token_id = cehrgpt_tokenizer.end_token_id
+
+    if not os.path.exists(training_args.output_dir):
+        os.makedirs(training_args.output_dir)
+
+    prepared_ds_path = generate_prepared_ds_path(
+        data_args, model_args, data_folder=data_args.cohort_folder
+    )
+
+    processed_dataset = None
+    if any(prepared_ds_path.glob("*")):
+        LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
+        processed_dataset = load_from_disk(str(prepared_ds_path))
+        LOG.info("Prepared dataset loaded from disk...")
+        if cehrgpt_args.expand_tokenizer:
+            if tokenizer_exists(training_args.output_dir):
+                cehrgpt_tokenizer = CehrGptTokenizer.from_pretrained(
+                    training_args.output_dir
+                )
+            else:
+                LOG.warning(
+                    f"CehrGptTokenizer must exist in {training_args.output_dir} "
+                    f"when the dataset has been processed and expand_tokenizer is set to True. "
+                    f"Please delete the processed dataset at {prepared_ds_path}."
+                )
+                processed_dataset = None
+                shutil.rmtree(prepared_ds_path)
+
+    if processed_dataset is None and is_main_process(training_args.local_rank):
+        # If the full dataset has been tokenized, we don't want to tokenize the cohort containing
+        # the subset of the data. We should slice out the portion of the tokenized sequences for each sample
+        if cehrgpt_args.tokenized_full_dataset_path is not None:
+            processed_dataset = extract_cohort_sequences(data_args, cehrgpt_args)
+        else:
+            # Organize them into a single DatasetDict
+            final_splits = prepare_finetune_dataset(
+                data_args, training_args, cehrgpt_args
+            )
+            # TODO: temp solution, this column is mixed typed and causes an issue when transforming the data
+            if not data_args.streaming:
+                all_columns = final_splits["train"].column_names
+                if "visit_concept_ids" in all_columns:
+                    final_splits = final_splits.remove_columns(["visit_concept_ids"])
+
+            processed_dataset = create_cehrgpt_finetuning_dataset(
+                dataset=final_splits,
+                cehrgpt_tokenizer=cehrgpt_tokenizer,
+                data_args=data_args,
+            )
+            if not data_args.streaming:
+                processed_dataset.save_to_disk(prepared_ds_path)
+                processed_dataset.cleanup_cache_files()
+
+    # After main-process-only operations, synchronize all processes to ensure consistency
+    if dist.is_available() and dist.is_initialized():
+        dist.barrier()
+
+    # We suppress the additional learning objectives in fine-tuning
+    data_collator = CehrGptDataCollator(
+        tokenizer=cehrgpt_tokenizer,
+        max_length=cehrgpt_args.generation_input_length,
+        include_values=cehrgpt_model.config.include_values,
+        pretraining=False,
+        include_ttv_prediction=False,
+        use_sub_time_tokenization=False,
+        include_demographics=False,
+        add_linear_prob_token=False,
+    )
+
+    LOG.info(
+        "Generating %s trajectories per sample",
+        cehrgpt_args.num_of_trajectories_per_sample,
+    )
+    for sample_i in range(cehrgpt_args.num_of_trajectories_per_sample):
+        for split, dataset in processed_dataset.items():
+            meds_split = map_data_split_name(split)
+            dataloader = DataLoader(
+                dataset=dataset,
+                batch_size=training_args.per_device_eval_batch_size,
+                num_workers=training_args.dataloader_num_workers,
+                collate_fn=data_collator,
+                pin_memory=training_args.dataloader_pin_memory,
+            )
+            sample_output_dir = (
+                Path(training_args.output_dir) / meds_split / f"{sample_i}"
+            )
+            sample_output_dir.mkdir(exist_ok=True, parents=True)
+            for batch_i, batch in tqdm(
+                enumerate(dataloader),
+                desc=f"Generating Trajectories for split {meds_split} with trajectory {sample_i + 1}",
+            ):
+                output_parquet_file = sample_output_dir / f"{batch_i}.parquet"
+                if output_parquet_file.exists():
+                    LOG.info("%s already exists, skip...", output_parquet_file)
+                    continue
+
+                generate_trajectories_per_batch(
+                    batch,
+                    cehrgpt_tokenizer,
+                    cehrgpt_model,
+                    device,
+                    sample_output_dir / f"{batch_i}.parquet",
+                    cehrgpt_args.generation_max_new_tokens
+                    + cehrgpt_args.generation_input_length,
+                )
+
+
+if __name__ == "__main__":
+    # ✅ Call first thing inside main()
+    seed_all(42)
+    main()
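Note on the timestamp logic in the new script: generated continuations carry only relative time tokens, so the loop advances an epoch-seconds cursor from the last observed event and materializes a wall-clock timestamp whenever a clinical concept is emitted. Below is a minimal sketch of that bookkeeping using the gpt_utils helpers changed later in this diff; the token strings are illustrative and assume the "D<n>" day and "i-H<n>" hour token scheme visible elsewhere in this diff.

# Illustrative sketch; not part of the released package.
import datetime

from cehrgpt.gpt_utils import (
    extract_time_interval_in_days,
    extract_time_interval_in_hours,
    is_att_token,
    is_inpatient_hour_token,
)

# Hypothetical continuation: a 7-day gap, a concept, a 2-hour inpatient gap, a concept.
generated = ["D7", "320128", "i-H2", "4329847"]
cursor = datetime.datetime(2024, 1, 1).timestamp()  # last observed event time

for token in generated:
    if is_att_token(token):
        cursor += extract_time_interval_in_days(token) * 24 * 3600
    elif is_inpatient_hour_token(token):
        cursor += extract_time_interval_in_hours(token) * 3600
    else:
        # Clinical events land at the current cursor position.
        print(token, datetime.datetime.utcfromtimestamp(cursor))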
cehrgpt/generation/generate_batch_hf_gpt_sequence.py
CHANGED
@@ -74,7 +74,10 @@ def generate_single_batch(
     model: CEHRGPT2LMHeadModel,
     tokenizer: CehrGptTokenizer,
     prompts: List[List[int]],
-
+    max_length: int,
+    values: Optional[torch.Tensor] = None,
+    value_indicators: Optional[torch.Tensor] = None,
+    max_new_tokens: Optional[int] = None,
     mini_num_of_concepts=1,
     top_p=0.95,
     top_k=50,
@@ -88,7 +91,8 @@ def generate_single_batch(
     with torch.no_grad():
        generation_config = GenerationConfig(
            repetition_penalty=repetition_penalty,
-
+            max_new_tokens=max_new_tokens,
+            max_length=max_length,
            min_length=mini_num_of_concepts,
            temperature=temperature,
            top_p=top_p,
@@ -107,9 +111,17 @@ def generate_single_batch(
            num_beam_groups=num_beam_groups,
            epsilon_cutoff=epsilon_cutoff,
        )
+
        batched_prompts = torch.tensor(prompts).to(device)
+        if values is not None:
+            values = values.to(device)
+        if value_indicators is not None:
+            value_indicators = value_indicators.to(device)
+
        results = model.generate(
            inputs=batched_prompts,
+            values=values,
+            value_indicators=value_indicators,
            generation_config=generation_config,
            lab_token_ids=tokenizer.lab_token_ids,
        )
@@ -226,7 +238,7 @@ def main(args):
            cehrgpt_model,
            cehrgpt_tokenizer,
            random_prompts[: args.batch_size],
-
+            max_length=args.context_window,
            mini_num_of_concepts=args.min_num_of_concepts,
            top_p=args.top_p,
            top_k=args.top_k,
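The reworked generate_single_batch signature lets callers pass the conditioning tensors (values, value_indicators) produced by the data collator and cap output via max_length and/or max_new_tokens. A hedged sketch of a call site follows; the checkpoint paths, token ids, and tensor shapes/dtypes are assumptions, not taken from the package.

# Illustrative sketch; paths, token ids, and tensor shapes are assumed.
import torch

from cehrgpt.generation.generate_batch_hf_gpt_sequence import generate_single_batch
from cehrgpt.models.hf_cehrgpt import CEHRGPT2LMHeadModel
from cehrgpt.models.tokenization_hf_cehrgpt import CehrGptTokenizer

tokenizer = CehrGptTokenizer.from_pretrained("path/to/tokenizer")
model = CEHRGPT2LMHeadModel.from_pretrained("path/to/model").eval()

prompts = [[101, 2045, 87, 5310]]  # token ids for one demographic prompt
values = torch.zeros(1, 4)                              # optional lab-value conditioning
value_indicators = torch.zeros(1, 4, dtype=torch.bool)  # marks positions that carry values

batch = generate_single_batch(
    model,
    tokenizer,
    prompts,
    max_length=1024,        # hard cap on total sequence length
    values=values,
    value_indicators=value_indicators,
    max_new_tokens=512,     # cap on the generated suffix alone
    top_p=0.95,
    top_k=50,
    device="cpu",
)
print(batch["sequences"][0])  # generated concept tokens, per the caller in the new script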
cehrgpt/generation/omop_converter_batch.py
CHANGED
@@ -60,6 +60,24 @@ OOV_CONCEPT_MAP = {
 }
 
 
+def extract_gender_concept_id(gender_token: str) -> int:
+    if gender_token.startswith("Gender/"):
+        return int(gender_token[len("Gender/") :])
+    elif gender_token.isnumeric():
+        return int(gender_token)
+    else:
+        return 0
+
+
+def extract_race_concept_id(race_token: str) -> int:
+    if race_token.startswith("Race/"):
+        return int(race_token[len("Race/") :])
+    elif race_token.isnumeric():
+        return int(race_token)
+    else:
+        return 0
+
+
 def create_folder_if_not_exists(output_folder, table_name):
     if not os.path.isdir(Path(output_folder) / table_name):
         os.mkdir(Path(output_folder) / table_name)
@@ -288,7 +306,13 @@ def gpt_to_omop_converter_batch(
         if int(birth_year) < 1900 or int(birth_year) > datetime.date.today().year:
             continue
 
-        p = Person(
+        p = Person(
+            person_id=person_id,
+            gender_concept_id=extract_gender_concept_id(start_gender),
+            year_of_birth=birth_year,
+            race_concept_id=extract_race_concept_id(start_race),
+        )
+
         append_to_dict(omop_export_dict, p, person_id)
         id_mappings_dict["person"][person_id] = person_id
         pt_seq_dict[person_id] = " ".join(concept_ids)
@@ -316,7 +340,12 @@ def gpt_to_omop_converter_batch(
                 id_mappings_dict["death"][person_id] = person_id
             else:
                 try:
-
+                    if clinical_events[event_idx + 1].startswith("Visit/"):
+                        visit_concept_id = int(
+                            clinical_events[event_idx + 1][len("Visit/") :]
+                        )
+                    else:
+                        visit_concept_id = int(clinical_events[event_idx + 1])
                     inpatient_visit_indicator = visit_concept_id in [
                         9201,
                         262,
@@ -349,6 +378,7 @@ def gpt_to_omop_converter_batch(
                         visit_occurrence_id
                     ] = person_id
                    visit_occurrence_id += 1
+
                elif event in ATT_TIME_TOKENS:
                    if event[0] == "D":
                        att_date_delta = int(event[1:])
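The two new helpers make the converter tolerant of both bare OMOP concept ids and MEDS-style prefixed tokens when rebuilding the person table (and the Visit/ branch above does the same for visit codes). Their behavior on the three input shapes they handle follows directly from the code in this hunk:

# Expected behavior of the helpers shown above; concept ids are illustrative.
from cehrgpt.generation.omop_converter_batch import (
    extract_gender_concept_id,
    extract_race_concept_id,
)

assert extract_gender_concept_id("Gender/8507") == 8507  # MEDS-style token
assert extract_gender_concept_id("8532") == 8532         # bare numeric string
assert extract_gender_concept_id("UNKNOWN") == 0         # fallback
assert extract_race_concept_id("Race/8527") == 8527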
cehrgpt/gpt_utils.py
CHANGED
@@ -11,6 +11,7 @@ from cehrgpt.models.special_tokens import (
 )
 
 # Regular expression pattern to match inpatient attendance tokens
+MEDS_CODE_PATTERN = re.compile(r".*/.*")
 INPATIENT_ATT_PATTERN = re.compile(r"(?:VS-|i-)D(\d+)(?:-VE)?")
 DEMOGRAPHIC_PROMPT_SIZE = 4
 
@@ -194,8 +195,12 @@ def get_cehrgpt_output_folder(args, cehrgpt_tokenizer) -> str:
     return folder_name
 
 
-def is_clinical_event(token: str) -> bool:
-
+def is_clinical_event(token: str, meds: bool = False) -> bool:
+    if token.isnumeric():
+        return True
+    if meds:
+        return bool(MEDS_CODE_PATTERN.match(token))
+    return False
 
 
 def is_visit_start(token: str):
@@ -212,6 +217,18 @@ def is_visit_end(token: str) -> bool:
     return token in ["VE", "[VE]"]
 
 
+def is_inpatient_hour_token(token: str) -> bool:
+    return token.startswith("i-H")
+
+
+def extract_time_interval_in_hours(token: str) -> int:
+    try:
+        hour = int(token[3:])
+        return hour
+    except ValueError:
+        return 0
+
+
 def is_att_token(token: str):
     """
     Check if the token is an attention token.
@@ -251,6 +268,7 @@ def is_artificial_token(token: str) -> bool:
         return True
     if token == END_TOKEN:
         return True
+
     return False
 
 
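With the new meds flag, is_clinical_event also accepts slash-delimited MEDS codes, while the default path still treats only bare numeric OMOP ids as clinical events; the new hour helpers cover "i-H<n>" inpatient tokens. Expected behavior per the code above (the example codes are illustrative):

# Expected behavior of the helpers shown in these hunks.
from cehrgpt.gpt_utils import (
    extract_time_interval_in_hours,
    is_clinical_event,
    is_inpatient_hour_token,
)

assert is_clinical_event("320128") is True               # numeric OMOP id
assert is_clinical_event("LAB/1234") is False            # slash code, meds off
assert is_clinical_event("LAB/1234", meds=True) is True  # slash code, meds on
assert is_inpatient_hour_token("i-H6") is True
assert extract_time_interval_in_hours("i-H6") == 6       # int(token[3:])
assert extract_time_interval_in_hours("i-Hxx") == 0      # ValueError fallback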
cehrgpt/models/config.py
CHANGED
@@ -121,6 +121,7 @@ class CEHRGPTConfig(PretrainedConfig):
         bos_token_id=50256,
         eos_token_id=50256,
         lab_token_ids=None,
+        ve_token_id=None,
         scale_attn_by_inverse_layer_idx=False,
         reorder_and_upcast_attn=False,
         exclude_position_ids=False,
@@ -128,6 +129,10 @@ class CEHRGPTConfig(PretrainedConfig):
         value_vocab_size=None,
         include_ttv_prediction=False,
         use_sub_time_tokenization=True,
+        include_motor_time_to_event=True,
+        motor_tte_vocab_size=None,
+        motor_time_to_event_weight=1.0,
+        motor_num_time_pieces=16,
         token_to_time_token_mapping: Dict[int, List] = None,
         use_pretrained_embeddings=False,
         n_pretrained_embeddings_layers=2,
@@ -144,6 +149,7 @@ class CEHRGPTConfig(PretrainedConfig):
         entropy_penalty=False,
         entropy_penalty_alpha=0.01,
         sample_packing_max_positions=None,
+        class_weights=None,
         **kwargs,
     ):
         if token_to_time_token_mapping is None:
@@ -192,6 +198,22 @@ class CEHRGPTConfig(PretrainedConfig):
         self._token_to_time_token_mapping = token_to_time_token_mapping
         self.time_token_loss_weight = time_token_loss_weight
         self.time_to_visit_loss_weight = time_to_visit_loss_weight
+
+        # MOTOR TTE configuration
+        self.motor_tte_vocab_size = motor_tte_vocab_size
+        self.include_motor_time_to_event = (
+            include_motor_time_to_event
+            and self.motor_tte_vocab_size
+            and self.motor_tte_vocab_size > 0
+        )
+        if self.include_motor_time_to_event and not ve_token_id:
+            raise RuntimeError(
+                f"ve_token_id must be provided when include_motor_time_to_event is True"
+            )
+        self.ve_token_id = ve_token_id
+        self.motor_time_to_event_weight = motor_time_to_event_weight
+        self.motor_num_time_pieces = motor_num_time_pieces
+
         self.causal_sfm = causal_sfm
         self.demographics_size = demographics_size
         self.use_pretrained_embeddings = use_pretrained_embeddings
@@ -206,6 +228,9 @@ class CEHRGPTConfig(PretrainedConfig):
         self.entropy_penalty_alpha = entropy_penalty_alpha
         self.value_prediction_loss_weight = value_prediction_loss_weight
 
+        # Class weights for fine-tuning
+        self.class_weights = class_weights
+
         kwargs["tie_word_embeddings"] = not use_pretrained_embeddings
 
         super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
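Per the constructor logic above, the MOTOR time-to-event objective is only enabled when a positive motor_tte_vocab_size is supplied, and a ve_token_id is then mandatory. A sketch with illustrative values (all other constructor arguments are left at their defaults):

# Illustrative values; only the new 0.1.2 arguments are shown.
from cehrgpt.models.config import CEHRGPTConfig

config = CEHRGPTConfig(
    include_motor_time_to_event=True,
    motor_tte_vocab_size=512,   # enables the MOTOR TTE head
    motor_num_time_pieces=16,   # number of time pieces for the TTE head (default)
    ve_token_id=4,              # omitting this raises RuntimeError
    class_weights=[1.0, 5.0],   # new: per-class weights for fine-tuning
)
assert config.include_motor_time_to_event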