cehrgpt 0.0.2__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cehrgpt/data/hf_cehrgpt_dataset.py +24 -4
- cehrgpt/data/hf_cehrgpt_dataset_collator.py +260 -84
- cehrgpt/data/hf_cehrgpt_dataset_mapping.py +99 -88
- cehrgpt/data/sample_packing_sampler.py +151 -0
- cehrgpt/generation/generate_batch_hf_gpt_sequence.py +12 -9
- cehrgpt/models/config.py +10 -0
- cehrgpt/models/hf_cehrgpt.py +243 -73
- cehrgpt/models/tokenization_hf_cehrgpt.py +4 -0
- cehrgpt/runners/data_utils.py +243 -0
- cehrgpt/runners/gpt_runner_util.py +0 -10
- cehrgpt/runners/hf_cehrgpt_finetune_runner.py +152 -279
- cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +229 -105
- cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +42 -0
- cehrgpt/runners/hyperparameter_search_util.py +4 -1
- cehrgpt/runners/sample_packing_trainer.py +168 -0
- cehrgpt/simulations/generate_plots.py +95 -0
- cehrgpt/simulations/run_simulation.sh +24 -0
- cehrgpt/simulations/time_embedding_simulation.py +250 -0
- cehrgpt/simulations/time_token_simulation.py +177 -0
- cehrgpt/tools/linear_prob/__init__.py +0 -0
- cehrgpt/tools/linear_prob/compute_cehrgpt_features.py +467 -0
- cehrgpt/tools/linear_prob/train_with_cehrgpt_features.py +152 -0
- {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.0.dist-info}/METADATA +7 -5
- {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.0.dist-info}/RECORD +28 -26
- {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.0.dist-info}/WHEEL +1 -1
- cehrgpt/data/hf_cehrgpt_dpo_collator.py +0 -71
- cehrgpt/data/hf_cehrgpt_dpo_dataset_mapping.py +0 -61
- cehrgpt/generation/generate_paired_cehrgpt_sequence.py +0 -224
- cehrgpt/rl_finetune/cehrgpt_dpo_trainer.py +0 -586
- cehrgpt/rl_finetune/cehrgpt_ppo_trainer.py +0 -464
- cehrgpt/rl_finetune/ppo_finetune.py +0 -394
- cehrgpt/rl_finetune/ppo_finetune_v2.py +0 -373
- cehrgpt/runners/hf_cehrgpt_dpo_runner.py +0 -119
- /cehrgpt/{rl_finetune → simulations}/__init__.py +0 -0
- {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.0.dist-info/licenses}/LICENSE +0 -0
- {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.0.dist-info}/top_level.txt +0 -0
cehrgpt/data/hf_cehrgpt_dataset.py (+24 -4)

```diff
@@ -1,9 +1,10 @@
-from typing import Union
+from typing import Optional, Union
 
 from cehrbert.data_generators.hf_data_generator.hf_dataset import (
     FINETUNING_COLUMNS,
     apply_cehrbert_dataset_mapping,
 )
+from cehrbert.data_generators.hf_data_generator.meds_utils import CacheFileCollector
 from cehrbert.runners.hf_runner_argument_dataclass import DataTrainingArguments
 from datasets import Dataset, DatasetDict
 
@@ -31,16 +32,25 @@ def create_cehrgpt_pretraining_dataset(
     dataset: Union[Dataset, DatasetDict],
     cehrgpt_tokenizer: CehrGptTokenizer,
     data_args: DataTrainingArguments,
-) -> Dataset:
+    cache_file_collector: Optional[CacheFileCollector] = None,
+) -> Union[Dataset, DatasetDict]:
     required_columns = TRANSFORMER_COLUMNS + CEHRGPT_COLUMNS
+    # TODO: temp solution, this column is mixed typed and causes an issue when transforming the data
+    if not data_args.streaming:
+        if isinstance(dataset, DatasetDict):
+            all_columns = dataset["train"].column_names
+        else:
+            all_columns = dataset.column_names
+        if "visit_concept_ids" in all_columns:
+            dataset.remove_columns(["visit_concept_ids"])
     dataset = apply_cehrbert_dataset_mapping(
         dataset,
         HFCehrGptTokenizationMapping(cehrgpt_tokenizer),
         num_proc=data_args.preprocessing_num_workers,
         batch_size=data_args.preprocessing_batch_size,
         streaming=data_args.streaming,
+        cache_file_collector=cache_file_collector,
     )
-
     if not data_args.streaming:
         if isinstance(dataset, DatasetDict):
             all_columns = dataset["train"].column_names
@@ -56,8 +66,17 @@ def create_cehrgpt_finetuning_dataset(
     dataset: Union[Dataset, DatasetDict],
     cehrgpt_tokenizer: CehrGptTokenizer,
     data_args: DataTrainingArguments,
-) -> Dataset:
+    cache_file_collector: Optional[CacheFileCollector] = None,
+) -> Union[Dataset, DatasetDict]:
     required_columns = TRANSFORMER_COLUMNS + CEHRGPT_COLUMNS + FINETUNING_COLUMNS
+    # TODO: temp solution, this column is mixed typed and causes an issue when transforming the data
+    if not data_args.streaming:
+        if isinstance(dataset, DatasetDict):
+            all_columns = dataset["train"].column_names
+        else:
+            all_columns = dataset.column_names
+        if "visit_concept_ids" in all_columns:
+            dataset.remove_columns(["visit_concept_ids"])
     mapping_functions = [
         HFFineTuningMapping(cehrgpt_tokenizer),
     ]
@@ -68,6 +87,7 @@ def create_cehrgpt_finetuning_dataset(
         num_proc=data_args.preprocessing_num_workers,
         batch_size=data_args.preprocessing_batch_size,
         streaming=data_args.streaming,
+        cache_file_collector=cache_file_collector,
     )
 
     if not data_args.streaming:
```
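
Both dataset factories now accept an optional `CacheFileCollector` and hand it through to `apply_cehrbert_dataset_mapping`, so the caller can track the intermediate cache files that non-streaming mapping creates. A minimal sketch of the new call shape; the paths are placeholders, `data_args` is assumed to be the `DataTrainingArguments` instance you already use, and the collector's cleanup API is not part of this diff:

```python
from cehrbert.data_generators.hf_data_generator.meds_utils import CacheFileCollector
from datasets import load_from_disk

from cehrgpt.data.hf_cehrgpt_dataset import create_cehrgpt_pretraining_dataset
from cehrgpt.models.tokenization_hf_cehrgpt import CehrGptTokenizer

dataset = load_from_disk("/path/to/prepared_dataset")  # placeholder path
tokenizer = CehrGptTokenizer.from_pretrained("/path/to/tokenizer")  # placeholder path
cache_collector = CacheFileCollector()

tokenized = create_cehrgpt_pretraining_dataset(
    dataset=dataset,
    cehrgpt_tokenizer=tokenizer,
    data_args=data_args,  # your existing cehrbert DataTrainingArguments
    cache_file_collector=cache_collector,  # optional; defaults to None
)
# The collector now knows about the cache files produced during mapping, so the
# calling runner can remove them once the transformed dataset has been saved
# (the cleanup call itself is not shown in this diff).
```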

cehrgpt/data/hf_cehrgpt_dataset_collator.py (+260 -84)

```diff
@@ -1,5 +1,5 @@
 import random
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 
 import numpy as np
 import torch
@@ -105,9 +105,12 @@ class CehrGptDataCollator:
             self._try_reverse_tensor(self._convert_to_tensor(example["input_ids"]))
             for example in examples
         ]
+
         batch_attention_mask = [
             self._try_reverse_tensor(
-                torch.ones_like(
+                self._convert_to_tensor(example["attention_mask"]).to(torch.float)
+                if "attention_mask" in example
+                else torch.ones_like(
                     self._convert_to_tensor(example["input_ids"]), dtype=torch.float
                 )
             )
@@ -128,16 +131,40 @@ class CehrGptDataCollator:
         )
         assert batch["input_ids"].shape[1] <= self.max_length
         assert batch["attention_mask"].shape[1] <= self.max_length
+        assert batch["attention_mask"].shape[1] == batch["input_ids"].shape[1], (
+            f'batch["attention_mask"].shape[1]: {batch["attention_mask"].shape[1]}, '
+            f'batch["input_ids"].shape[1]: {batch["input_ids"].shape[1]}'
+        )
+        assert batch["input_ids"].max() < self.tokenizer.vocab_size, (
+            f"batch['input_ids'].max(): {batch['input_ids'].max()} must be smaller than "
+            f"self.tokenizer.vocab_size: {self.tokenizer.vocab_size}. "
+            f"batch['input_ids']: {batch['input_ids']} "
+        )
 
-        if self.pretraining:
-            batch["labels"] = self._try_reverse_tensor(
+        if "position_ids" in examples[0]:
+            batch_position_ids = [
+                self._try_reverse_tensor(
+                    self._convert_to_tensor(example["position_ids"])
+                )
+                for example in examples
+            ]
+            # Pad sequences to the max length in the batch
+            batch["position_ids"] = self._try_reverse_tensor(
                 pad_sequence(
-                    batch_input_ids,
+                    batch_position_ids,
                     batch_first=True,
-                    padding_value=-100,
+                    padding_value=self.max_length,
                 ).to(torch.int64)
             )
 
+        if self.pretraining:
+            batch["labels"] = torch.where(
+                (batch["input_ids"] != self.tokenizer.pad_token_id)
+                & batch["attention_mask"].to(torch.bool),
+                batch["input_ids"],
+                -100,
+            )
+
         if self.use_sub_time_tokenization:
             time_token_indicators = torch.isin(batch["input_ids"], self.time_tokens)
             masked_tokens = batch["input_ids"].clone()
```
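
Note the change in how pretraining labels are built: instead of padding a copy of the input ids with `-100`, the collator now masks pad and non-attended positions directly with `torch.where`, which also works when several packed sequences share one row. A standalone sketch of the rule with toy values (a pad token id of 0 is assumed here for illustration):

```python
import torch

pad_token_id = 0  # assumed pad token id for this toy example
input_ids = torch.tensor([[5, 7, 9, 0], [4, 0, 0, 0]])
attention_mask = (input_ids != pad_token_id).to(torch.float)

# Positions that are padding or not attended to receive the ignore index -100,
# so the cross-entropy loss skips them.
labels = torch.where(
    (input_ids != pad_token_id) & attention_mask.to(torch.bool),
    input_ids,
    -100,
)
print(labels)
# tensor([[   5,    7,    9, -100],
#         [   4, -100, -100, -100]])
```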
```diff
@@ -170,7 +197,7 @@ class CehrGptDataCollator:
         if self.include_values:
             batch_value_indicators = [
                 self._try_reverse_tensor(
-                    self._convert_to_tensor(example["value_indicators"])
+                    self._convert_to_tensor(example["value_indicators"]).to(torch.bool)
                 )
                 for example in examples
             ]
@@ -178,7 +205,6 @@ class CehrGptDataCollator:
                 self._try_reverse_tensor(self._convert_to_tensor(example["values"]))
                 for example in examples
             ]
-
             batch["value_indicators"] = self._try_reverse_tensor(
                 pad_sequence(
                     batch_value_indicators, batch_first=True, padding_value=False
@@ -200,41 +226,58 @@ class CehrGptDataCollator:
                 batch["value_indicators"], batch["values"].clone(), -100
             )
 
+        bz = len(examples)
         if "person_id" in examples[0]:
-            batch["person_id"] = torch.cat(
-                [
-                    self._convert_to_tensor(example["person_id"]).reshape(-1, 1)
-                    for example in examples
-                ],
-                dim=0,
-            ).to(torch.int32)
+            batch["person_id"] = (
+                torch.cat(
+                    [
+                        self._convert_to_tensor(example["person_id"]).reshape(-1, 1)
+                        for example in examples
+                    ],
+                    dim=0,
+                )
+                .to(torch.int32)
+                .reshape(bz, -1)
+            )
 
         if "index_date" in examples[0]:
             batch["index_date"] = torch.cat(
                 [
-                    self._convert_to_tensor(example["index_date"]).reshape(-1, 1)
+                    torch.tensor(example["index_date"], dtype=torch.float64).reshape(
+                        -1, 1
+                    )
                     for example in examples
                 ],
                 dim=0,
-            ).to(torch.float64)
+            ).reshape(bz, -1)
 
         if "age_at_index" in examples[0]:
-            batch["age_at_index"] = torch.cat(
-                [
-                    self._convert_to_tensor(example["age_at_index"]).reshape(-1, 1)
-                    for example in examples
-                ],
-                dim=0,
-            ).to(torch.float32)
+            batch["age_at_index"] = (
+                torch.cat(
+                    [
+                        self._convert_to_tensor(example["age_at_index"]).reshape(-1, 1)
+                        for example in examples
+                    ],
+                    dim=0,
+                )
+                .to(torch.float32)
+                .reshape(bz, -1)
+            )
 
         if "classifier_label" in examples[0]:
-            batch["classifier_label"] = torch.cat(
-                [
-                    self._convert_to_tensor(example["classifier_label"]).reshape(-1, 1)
-                    for example in examples
-                ],
-                dim=0,
-            ).to(torch.float32)
+            batch["classifier_label"] = (
+                torch.cat(
+                    [
+                        self._convert_to_tensor(example["classifier_label"]).reshape(
+                            -1, 1
+                        )
+                        for example in examples
+                    ],
+                    dim=0,
+                )
+                .to(torch.float32)
+                .reshape(bz, -1)
+            )
 
         return batch
 
```
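
The scalar per-example fields (`person_id`, `index_date`, `age_at_index`, `classifier_label`) are now reshaped against an explicit `bz = len(examples)`, which guarantees a 2-D `(batch_size, n)` tensor even when every example carries a single value. A toy illustration of the pattern:

```python
import torch

examples = [{"age_at_index": 63.0}, {"age_at_index": 41.0}, {"age_at_index": 77.0}]
bz = len(examples)

# Stack one column per example, then force the (batch_size, n) shape.
age_at_index = (
    torch.cat(
        [torch.tensor(e["age_at_index"]).reshape(-1, 1) for e in examples], dim=0
    )
    .to(torch.float32)
    .reshape(bz, -1)
)
print(age_at_index.shape)  # torch.Size([3, 1])
```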
```diff
@@ -273,53 +316,69 @@ class CehrGptDataCollator:
             record["input_ids"] = self._convert_to_tensor(sorted_input_ids)
             return record
 
-    def generate_start_end_index(self, record: Dict[str, Any]) -> Dict[str, Any]:
+    def generate_start_end_index(
+        self, record: Dict[str, Any], max_length_allowed: Optional[int] = None
+    ) -> Dict[str, Any]:
         """Adding the start and end indices to extract a portion of the patient sequence."""
         # concept_ids will be used to for time to event predictions and identifying the visit starts
+        max_length_allowed = (
+            self.max_length if max_length_allowed is None else max_length_allowed
+        )
+        sample_packing = getattr(self, "sample_packing", False)
         input_ids = record["input_ids"]
         if isinstance(input_ids, torch.Tensor):
             input_ids = input_ids.detach().tolist()
         concept_ids = self.tokenizer.decode(input_ids, skip_special_tokens=False)
         seq_length = len(record["input_ids"])
-        new_max_length = self.max_length - 1  # Subtract one for the [END] token
+
+        # Subtract one for the [END] token when sample_packing is not enabled
+        new_max_length = (
+            max_length_allowed if sample_packing else max_length_allowed - 1
+        )
+
+        if self.include_ttv_prediction:
+            record["time_to_visits"] = torch.concat(
+                [self._convert_to_tensor(self._convert_time_to_event(concept_ids))]
+            )
 
         # Return the record directly if the actual sequence length is less than the max sequence
         if seq_length <= new_max_length:
-            record["input_ids"] = torch.concat(
-                [
-                    self._convert_to_tensor(record["input_ids"]),
-                    self._convert_to_tensor([self.tokenizer.end_token_id]),
-                ]
-            )
-            if self.include_values:
-                record["value_indicators"] = torch.concat(
-                    [
-                        self._convert_to_tensor(record["value_indicators"]),
-                        self._convert_to_tensor([False]),
-                    ]
-                ).to(torch.bool)
-                record["values"] = torch.concat(
-                    [
-                        self._convert_to_tensor(record["values"]),
-                        self._convert_to_tensor([self.tokenizer.pad_value_token_id]),
-                    ]
-                )
-            if self.include_ttv_prediction:
-                record["time_to_visits"] = torch.concat(
+            if not sample_packing:
+                record["input_ids"] = torch.concat(
                     [
-                        self._convert_to_tensor(
-                            self._convert_time_to_event(concept_ids)
-                        ),
-                        self._convert_to_tensor([-100.0]),
+                        self._convert_to_tensor(record["input_ids"]),
+                        self._convert_to_tensor([self.tokenizer.end_token_id]),
                     ]
                 )
+                if self.include_values:
+                    record["value_indicators"] = torch.concat(
+                        [
+                            self._convert_to_tensor(record["value_indicators"]),
+                            self._convert_to_tensor([False]),
+                        ]
+                    ).to(torch.bool)
+                    record["values"] = torch.concat(
+                        [
+                            self._convert_to_tensor(record["values"]),
+                            self._convert_to_tensor(
+                                [self.tokenizer.pad_value_token_id]
+                            ),
+                        ]
+                    )
+                if self.include_ttv_prediction:
+                    record["time_to_visits"] = torch.concat(
+                        [
+                            record["time_to_visits"],
+                            self._convert_to_tensor([-100.0]),
+                        ]
+                    )
 
             return record
 
         if self.pretraining:
             # There is a 50% chance we randomly slice out a portion of the patient history and update the demographic
             # prompt depending on the new starting point
-            if random.random() < 0.5:
+            if random.random() < 0.5 and not sample_packing:
                 start_index, end_index, demographic_tokens = random_slice_gpt_sequence(
                     concept_ids, new_max_length
                 )
@@ -351,6 +410,11 @@ class CehrGptDataCollator:
                         break
 
                 record["input_ids"] = record["input_ids"][0:end_index]
+
+                # We want to make sure we take the subset of attention_mask in sample packing if this field is available
+                if sample_packing and "attention_mask" in record:
+                    record["attention_mask"] = record["attention_mask"][0:end_index]
+
                 if self.include_values:
                     record["value_indicators"] = self._convert_to_tensor(
                         record["value_indicators"][0:end_index]
@@ -364,7 +428,7 @@ class CehrGptDataCollator:
                 )
             return record
         else:
-            if self.include_demographics:
+            if self.include_demographics and not sample_packing:
                 # We employ a left truncation strategy, where the most recent patient history is reserved for fine-tuning
                 demographic_prompts_at_visits = collect_demographic_prompts_at_visits(
                     concept_ids
@@ -427,6 +491,10 @@ class CehrGptDataCollator:
                     current_token = record["input_ids"][i]
                     if current_token == self.vs_token_id:
                         record["input_ids"] = record["input_ids"][i:end_index]
+                        if sample_packing and "attention_mask" in record:
+                            record["attention_mask"] = record["attention_mask"][
+                                i:end_index
+                            ]
                         if self.include_values:
                             record["value_indicators"] = record["value_indicators"][
                                 i:end_index
@@ -442,6 +510,10 @@ class CehrGptDataCollator:
                 # We simply take the last new_max_length number of tokens from the patient sequence
                 if len(record["input_ids"]) > new_max_length:
                     record["input_ids"] = record["input_ids"][-new_max_length:]
+                    if sample_packing and "attention_mask" in record:
+                        record["attention_mask"] = record["attention_mask"][
+                            -new_max_length:
+                        ]
                     if self.include_values:
                         record["value_indicators"] = record["value_indicators"][
                             -new_max_length:
@@ -452,31 +524,135 @@ class CehrGptDataCollator:
                             -new_max_length:
                         ]
 
-            # Finally we add the end token to the end of the sequence
-            record["input_ids"] = torch.concat(
-                [
-                    self._convert_to_tensor(record["input_ids"]),
-                    self._convert_to_tensor([self.tokenizer.end_token_id]),
-                ]
-            )
-            if self.include_values:
-                record["value_indicators"] = torch.concat(
-                    [
-                        self._convert_to_tensor(record["value_indicators"]),
-                        self._convert_to_tensor([False]),
-                    ]
-                ).to(torch.bool)
-                record["values"] = torch.concat(
-                    [
-                        self._convert_to_tensor(record["values"]),
-                        self._convert_to_tensor([self.tokenizer.pad_value_token_id]),
-                    ]
-                )
-            if self.include_ttv_prediction:
-                record["time_to_visits"] = torch.concat(
+            if not sample_packing:
+                # Finally we add the end token to the end of the sequence
+                record["input_ids"] = torch.concat(
                     [
-                        record["time_to_visits"],
-                        self._convert_to_tensor([-100.0]),
+                        self._convert_to_tensor(record["input_ids"]),
+                        self._convert_to_tensor([self.tokenizer.end_token_id]),
                     ]
                 )
+                if self.include_values:
+                    record["value_indicators"] = torch.concat(
+                        [
+                            self._convert_to_tensor(record["value_indicators"]),
+                            self._convert_to_tensor([False]),
+                        ]
+                    ).to(torch.bool)
+                    record["values"] = torch.concat(
+                        [
+                            self._convert_to_tensor(record["values"]),
+                            self._convert_to_tensor(
+                                [self.tokenizer.pad_value_token_id]
+                            ),
+                        ]
+                    )
+                if self.include_ttv_prediction:
+                    record["time_to_visits"] = torch.concat(
+                        [
+                            record["time_to_visits"],
+                            self._convert_to_tensor([-100.0]),
+                        ]
+                    )
             return record
+
+
+class SamplePackingCehrGptDataCollator(CehrGptDataCollator):
+    def __init__(self, max_tokens, max_position_embeddings, *args, **kwargs):
+        self.max_tokens_per_batch = max_tokens
+        self.max_position_embeddings = max_position_embeddings
+        self.sample_packing = True
+        self.add_end_token_in_sample_packing = kwargs.pop(
+            "add_end_token_in_sample_packing", False
+        )
+        super(SamplePackingCehrGptDataCollator, self).__init__(*args, **kwargs)
+
+    def __call__(self, examples):
+        current_input_ids = []
+        current_attention_mask = []
+        current_position_ids = []
+        current_value_indicators = []
+        current_values = []
+
+        # Demographics
+        current_person_ids = []
+        current_index_dates = []
+
+        # Binary classification inputs
+        current_ages = []
+        current_labels = []
+
+        for idx, example in enumerate(examples):
+
+            # If the sample length exceeds the model's capacity, truncate this example
+            add_end_token = (
+                len(example["input_ids"]) <= self.max_position_embeddings
+                and self.add_end_token_in_sample_packing
+            )
+
+            if len(example["input_ids"]) > self.max_position_embeddings:
+                example = self.generate_start_end_index(
+                    example, self.max_position_embeddings
+                )
+
+            input_ids = example["input_ids"]
+            # We add [END] [PAD], we want to attend to [END], adding [END] is important for sequence generation.
+            # If the sequence length of the sequence is less than the context window, we add both [END][PAD], otherwise
+            # we only add [PAD] token to the end of the sequence because it's not finished
+            current_input_ids.extend(
+                list(input_ids)
+                + (
+                    [self.tokenizer.end_token_id, self.tokenizer.pad_token_id]
+                    if add_end_token
+                    else [self.tokenizer.pad_token_id]
+                )
+            )
+            current_attention_mask.extend(
+                np.ones_like(input_ids).tolist() + ([1, 0] if add_end_token else [0])
+            )
+            num_tokens_to_pad = 1 + int(add_end_token)
+            current_position_ids.extend(list(range(len(input_ids) + num_tokens_to_pad)))
+            if self.include_values:
+                current_value_indicators.extend(
+                    list(example["value_indicators"]) + [False] * num_tokens_to_pad
+                )
+                current_values.extend(
+                    list(example["values"])
+                    + [self.tokenizer.pad_value_token_id] * num_tokens_to_pad
+                )
+
+            if "person_id" in example:
+                current_person_ids.append(example["person_id"])
+
+            if "index_date" in example:
+                current_index_dates.append(example["index_date"])
+
+            if "age_at_index" in example:
+                current_ages.append(example["age_at_index"])
+
+            if "classifier_label" in example:
+                current_labels.append(example["classifier_label"])
+
+        assert (
+            len(current_input_ids) <= self.max_tokens_per_batch
+        ), f"the total number of tokens in the packed sequence should be less than { self.max_tokens_per_batch}"
+        packed_example = {
+            "input_ids": current_input_ids,
+            "attention_mask": current_attention_mask,
+            "position_ids": current_position_ids,
+        }
+        if self.include_values:
+            packed_example.update({"value_indicators": current_value_indicators})
+            packed_example.update({"values": current_values})
+
+        if current_labels:
+            packed_example.update(
+                {
+                    "person_id": current_person_ids,
+                    "index_date": current_index_dates,
+                    "age_at_index": current_ages,
+                    "classifier_label": current_labels,
+                }
+            )
+
+        return super().__call__([packed_example])
```
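
The new `SamplePackingCehrGptDataCollator` concatenates every example in the incoming list into a single packed row, separating patients with `[END]`/`[PAD]` tokens and restarting `position_ids` at 0 for each one, before delegating to the parent collator. A construction sketch; the numeric values are illustrative, and `tokenizer`/`max_length` are assumed to be existing `CehrGptDataCollator` keyword arguments:

```python
from cehrgpt.data.hf_cehrgpt_dataset_collator import SamplePackingCehrGptDataCollator
from cehrgpt.models.tokenization_hf_cehrgpt import CehrGptTokenizer

tokenizer = CehrGptTokenizer.from_pretrained("/path/to/tokenizer")  # placeholder path

collator = SamplePackingCehrGptDataCollator(
    max_tokens=16384,              # token budget for one packed row
    max_position_embeddings=2048,  # single examples longer than this are truncated
    tokenizer=tokenizer,           # assumed parent-collator keyword
    max_length=16384,              # assumed parent-collator keyword
    add_end_token_in_sample_packing=True,
)
```

Batches are expected to be pre-grouped upstream (see the new `cehrgpt/data/sample_packing_sampler.py` and `cehrgpt/runners/sample_packing_trainer.py` in this release) so that each list of examples fits inside `max_tokens` after the extra `[END]`/`[PAD]` tokens are appended; the collator asserts this invariant before handing the packed example to the parent `__call__`.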