cehrgpt 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cehrgpt/analysis/htn_treatment_pathway.py +546 -0
- cehrgpt/analysis/treatment_pathway/__init__.py +0 -0
- cehrgpt/analysis/treatment_pathway/depression_treatment_pathway.py +94 -0
- cehrgpt/analysis/treatment_pathway/diabetes_treatment_pathway.py +94 -0
- cehrgpt/analysis/treatment_pathway/htn_treatment_pathway.py +94 -0
- cehrgpt/analysis/treatment_pathway/treatment_pathway.py +631 -0
- cehrgpt/data/cehrgpt_data_processor.py +549 -0
- cehrgpt/data/hf_cehrgpt_dataset.py +4 -0
- cehrgpt/data/hf_cehrgpt_dataset_collator.py +285 -652
- cehrgpt/data/hf_cehrgpt_dataset_mapping.py +38 -5
- cehrgpt/generation/cehrgpt_conditional_generation.py +2 -0
- cehrgpt/generation/generate_batch_hf_gpt_sequence.py +20 -12
- cehrgpt/generation/omop_converter_batch.py +11 -4
- cehrgpt/gpt_utils.py +73 -3
- cehrgpt/models/activations.py +27 -0
- cehrgpt/models/config.py +6 -2
- cehrgpt/models/gpt2.py +560 -0
- cehrgpt/models/hf_cehrgpt.py +183 -460
- cehrgpt/models/tokenization_hf_cehrgpt.py +380 -50
- cehrgpt/omop/ontology.py +154 -0
- cehrgpt/runners/hf_cehrgpt_finetune_runner.py +24 -78
- cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +48 -44
- cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +46 -34
- cehrgpt/runners/hyperparameter_search_util.py +180 -69
- cehrgpt/runners/sample_packing_trainer.py +11 -2
- cehrgpt/tools/linear_prob/compute_cehrgpt_features.py +8 -2
- cehrgpt-0.1.3.dist-info/METADATA +238 -0
- {cehrgpt-0.1.2.dist-info → cehrgpt-0.1.3.dist-info}/RECORD +32 -22
- cehrgpt-0.1.2.dist-info/METADATA +0 -209
- /cehrgpt/tools/{merge_synthetic_real_dataasets.py → merge_synthetic_real_datasets.py} +0 -0
- {cehrgpt-0.1.2.dist-info → cehrgpt-0.1.3.dist-info}/WHEEL +0 -0
- {cehrgpt-0.1.2.dist-info → cehrgpt-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {cehrgpt-0.1.2.dist-info → cehrgpt-0.1.3.dist-info}/top_level.txt +0 -0
cehrgpt/data/hf_cehrgpt_dataset_mapping.py
CHANGED
@@ -28,6 +28,12 @@ from datasets.formatting.formatting import LazyBatch
 from dateutil.relativedelta import relativedelta
 from pandas import Series
 
+from cehrgpt.gpt_utils import (
+    construct_age_sequence,
+    construct_time_sequence,
+    encode_demographics,
+    multiple_of_10,
+)
 from cehrgpt.models.tokenization_hf_cehrgpt import (
     NONE_BIN,
     UNKNOWN_BIN,
@@ -43,6 +49,7 @@ CEHRGPT_COLUMNS = [
     "concept_values",
     "units",
     "epoch_times",
+    "ages",
 ]
 
 
@@ -121,6 +128,7 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
         cehrgpt_record: Dict[str, Any],
         code: str,
         time: datetime.datetime,
+        age: int,
         concept_value_mask: int = 0,
         number_as_value: float = 0.0,
         concept_as_value: str = "0",
@@ -128,6 +136,7 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
         unit: str = NA,
     ) -> None:
         cehrgpt_record["concept_ids"].append(replace_escape_chars(code))
+        cehrgpt_record["ages"].append(age)
         cehrgpt_record["concept_value_masks"].append(concept_value_mask)
         cehrgpt_record["number_as_values"].append(number_as_value)
         cehrgpt_record["concept_as_values"].append(concept_as_value)
@@ -141,6 +150,7 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
         cehrgpt_record = {
             "person_id": record["patient_id"],
             "concept_ids": [],
+            "ages": [],
             "concept_value_masks": [],
             "number_as_values": [],
             "concept_as_values": [],
@@ -168,14 +178,21 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
         first_visit_start_datetime: datetime.datetime = get_value(
             first_visit, "visit_start_datetime"
         )
+        starting_age = relativedelta(first_visit_start_datetime, birth_datetime).years
         year_str = f"year:{str(first_visit_start_datetime.year)}"
-        age_str = f"age:{
+        age_str = f"age:{starting_age}"
+        self._update_cehrgpt_record(
+            cehrgpt_record, year_str, first_visit_start_datetime, starting_age
+        )
+        self._update_cehrgpt_record(
+            cehrgpt_record, age_str, first_visit_start_datetime, starting_age
+        )
         self._update_cehrgpt_record(
-            cehrgpt_record,
+            cehrgpt_record, gender, first_visit_start_datetime, starting_age
+        )
+        self._update_cehrgpt_record(
+            cehrgpt_record, race, first_visit_start_datetime, starting_age
         )
-        self._update_cehrgpt_record(cehrgpt_record, age_str, first_visit_start_datetime)
-        self._update_cehrgpt_record(cehrgpt_record, gender, first_visit_start_datetime)
-        self._update_cehrgpt_record(cehrgpt_record, race, first_visit_start_datetime)
 
         # Use a data cursor to keep track of time
         datetime_cursor: Optional[datetime.datetime] = None
@@ -211,6 +228,7 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
                     cehrgpt_record,
                     code=self._time_token_function(time_delta),
                     time=visit_start_datetime,
+                    age=relativedelta(datetime_cursor, birth_datetime).years,
                 )
 
             datetime_cursor = visit_start_datetime
@@ -219,12 +237,14 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
                 cehrgpt_record,
                 code="[VS]",
                 time=datetime_cursor,
+                age=relativedelta(datetime_cursor, birth_datetime).years,
             )
             # Add a visit type token
             self._update_cehrgpt_record(
                 cehrgpt_record,
                 code=visit_type,
                 time=datetime_cursor,
+                age=relativedelta(datetime_cursor, birth_datetime).years,
             )
             # We need to insert an inpatient hour token right after the visit type, we calculate the hour interval
             # with respect to the midnight of the day
@@ -235,6 +255,7 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
                     cehrgpt_record,
                     code=f"i-H{datetime_cursor.hour}",
                     time=datetime_cursor,
+                    age=relativedelta(datetime_cursor, birth_datetime).years,
                 )
 
             # Keep track of the existing outpatient events, we don't want to add them again
@@ -281,6 +302,7 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
                         cehrgpt_record,
                         code=f"i-{self._inpatient_time_token_function(time_diff_days)}",
                         time=event_time,
+                        age=relativedelta(event_time, birth_datetime).years,
                     )
 
                     if self._include_inpatient_hour_token:
@@ -300,6 +322,7 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
                             cehrgpt_record,
                             code=f"i-H{time_diff_hours}",
                             time=event_time,
+                            age=relativedelta(event_time, birth_datetime).years,
                         )
 
                 if event_identity in existing_duplicate_events:
@@ -309,6 +332,7 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
                     cehrgpt_record,
                     code=code,
                     time=event_time,
+                    age=relativedelta(event_time, birth_datetime).years,
                     concept_value_mask=concept_value_mask,
                     unit=unit,
                     number_as_value=numeric_value if numeric_value else 0.0,
@@ -348,6 +372,7 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
                     cehrgpt_record,
                     code=discharge_facility,
                     time=datetime_cursor,
+                    age=relativedelta(datetime_cursor, birth_datetime).years,
                 )
 
             # Reuse the age and date calculated for the last event in the patient timeline
@@ -355,6 +380,7 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
                 cehrgpt_record,
                 code="[VE]",
                 time=datetime_cursor,
+                age=relativedelta(datetime_cursor, birth_datetime).years,
            )
 
         # Generate the orders of the concepts that the cehrbert dataset mapping function expects
@@ -428,6 +454,13 @@ class HFCehrGptTokenizationMapping(DatasetMappingDecorator):
         return record
 
     def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
+        # Reconstruct the ages input before the filter is applied
+        record["ages"] = construct_age_sequence(
+            record["concept_ids"], record.get("ages", None)
+        )
+        record["epoch_times"] = construct_time_sequence(
+            record["concept_ids"], record.get("epoch_times", None)
+        )
         # Remove the tokens from patient sequences that do not exist in the tokenizer
         record = self.filter_out_invalid_tokens(record)
         # If any concept has a value associated with it, we normalize the value
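Note: every `_update_cehrgpt_record` call above now threads an `age` argument derived from the event timestamp and the patient's `birth_datetime` via `dateutil.relativedelta`. A minimal sketch of that computation (the dates and the record dict are made up for illustration):

    import datetime
    from dateutil.relativedelta import relativedelta

    birth_datetime = datetime.datetime(1960, 5, 1)
    event_time = datetime.datetime(2021, 3, 15)

    record = {"concept_ids": [], "ages": []}
    record["concept_ids"].append("[VS]")
    record["ages"].append(relativedelta(event_time, birth_datetime).years)  # 60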
cehrgpt/generation/cehrgpt_conditional_generation.py
CHANGED
@@ -77,6 +77,7 @@ def generate_trajectories_per_batch(
     prediction_times = batch["index_date"].squeeze().detach().cpu().tolist()
     batched_epoch_times = batch["epoch_times"].detach().cpu().tolist()
     batched_input_ids = batch["input_ids"]
+    batched_ages = batch["ages"]
     batched_value_indicators = batch["value_indicators"]
     batched_values = batch["values"]
     # Make sure the batch does not exceed batch_size
@@ -84,6 +85,7 @@ def generate_trajectories_per_batch(
         cehrgpt_model,
         cehrgpt_tokenizer,
         batched_input_ids,
+        ages=batched_ages,
         values=batched_values,
         value_indicators=batched_value_indicators,
         max_length=max_length,
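Note: these two hunks only thread the collated `ages` feature through to batched generation. A condensed sketch of the flow, with placeholder tensors standing in for the data collator's output:

    import torch

    batch = {  # placeholder for a collated batch
        "input_ids": torch.tensor([[101, 102, 103]]),
        "ages": torch.tensor([[60, 60, 61]]),
    }
    batched_input_ids = batch["input_ids"]
    batched_ages = batch["ages"]
    # ...then forwarded as generate_single_batch(..., batched_input_ids, ages=batched_ages, ...)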
cehrgpt/generation/generate_batch_hf_gpt_sequence.py
CHANGED
@@ -2,7 +2,7 @@ import datetime
 import os
 import random
 import uuid
-from typing import Any, Dict,
+from typing import Any, Dict, Optional, Sequence, Tuple
 
 import numpy as np
 import pandas as pd
@@ -13,7 +13,7 @@ from transformers.utils import is_flash_attn_2_available, logging
 
 from cehrgpt.cehrgpt_args import create_inference_base_arg_parser
 from cehrgpt.generation.omop_converter_batch import START_TOKEN_SIZE
-from cehrgpt.gpt_utils import get_cehrgpt_output_folder
+from cehrgpt.gpt_utils import construct_age_sequence, get_cehrgpt_output_folder
 from cehrgpt.models.hf_cehrgpt import CEHRGPT2LMHeadModel
 from cehrgpt.models.special_tokens import END_TOKEN
 from cehrgpt.models.tokenization_hf_cehrgpt import (
@@ -72,9 +72,10 @@ def normalize_value(
 
 def generate_single_batch(
     model: CEHRGPT2LMHeadModel,
-
-    prompts:
+    cehrgpt_tokenizer: CehrGptTokenizer,
+    prompts: torch.Tensor,
     max_length: int,
+    ages: Optional[torch.Tensor] = None,
     values: Optional[torch.Tensor] = None,
     value_indicators: Optional[torch.Tensor] = None,
     max_new_tokens: Optional[int] = None,
@@ -112,7 +113,9 @@
         epsilon_cutoff=epsilon_cutoff,
     )
 
-    batched_prompts =
+    batched_prompts = prompts.to(device)
+    if ages is not None:
+        ages = ages.to(device)
     if values is not None:
         values = values.to(device)
     if value_indicators is not None:
@@ -120,19 +123,22 @@
 
     results = model.generate(
         inputs=batched_prompts,
+        ages=ages,
         values=values,
         value_indicators=value_indicators,
         generation_config=generation_config,
-
+        cehrgpt_tokenizer=cehrgpt_tokenizer,
     )
 
     sequences = [
-
+        cehrgpt_tokenizer.decode(seq.cpu().numpy(), skip_special_tokens=False)
        for seq in results.sequences
     ]
     if results.sequence_vals is not None:
         values = [
-
+            cehrgpt_tokenizer.decode_value(
+                values.cpu().numpy(), skip_special_tokens=False
+            )
             for values in results.sequence_vals
         ]
     else:
@@ -214,6 +220,7 @@ def main(args):
 
     # Randomly pick demographics from the existing population
     random_prompts = []
+    random_prompt_ages = []
     iter = 0
     while len(random_prompts) < args.batch_size:
         for row in dataset.select(
@@ -224,9 +231,9 @@
                 <= len(row["concept_ids"])
                 <= max_seq_allowed
             ):
-
-
-                )
+                prompt = row["concept_ids"][:prompt_size]
+                random_prompts.append(cehrgpt_tokenizer.encode(prompt))
+                random_prompt_ages.append(construct_age_sequence(prompt))
             iter += 1
         if not random_prompts and iter > 10:
             raise RuntimeError(
@@ -237,7 +244,8 @@
         batch_sequences = generate_single_batch(
             cehrgpt_model,
             cehrgpt_tokenizer,
-            random_prompts[: args.batch_size],
+            torch.tensor(random_prompts[: args.batch_size]),
+            ages=torch.tensor(random_prompt_ages[: args.batch_size]),
             max_length=args.context_window,
             mini_num_of_concepts=args.min_num_of_concepts,
             top_p=args.top_p,
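Note: the prompt-assembly loop now pairs each encoded demographic prompt with an age sequence rebuilt from the raw tokens. A hedged sketch of the idea (the token strings are illustrative; real prompts come from the sampled dataset rows):

    from cehrgpt.gpt_utils import construct_age_sequence

    # A demographic prompt: start year, start age, gender, race.
    prompt = ["year:2010", "age:45", "Gender/F", "Race/W"]
    ages = construct_age_sequence(prompt)
    # ages == [45, 45, 45, 45]: with no time-interval (ATT) tokens, the age never advances.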
cehrgpt/generation/omop_converter_batch.py
CHANGED
@@ -270,20 +270,24 @@ def gpt_to_omop_converter_batch(
 
     is_numeric_types = (
         is_numeric_types[START_TOKEN_SIZE:]
-        if is_numeric_types is not None
+        if is_numeric_types is not None and not np.all(pd.isna(is_numeric_types))
         else None
     )
     number_as_values = (
         number_as_values[START_TOKEN_SIZE:]
-        if number_as_values is not None
+        if number_as_values is not None and not np.all(pd.isna(number_as_values))
         else None
     )
     concept_as_values = (
         concept_as_values[START_TOKEN_SIZE:]
-        if concept_as_values is not None
+        if concept_as_values is not None and not np.all(pd.isna(concept_as_values))
+        else None
+    )
+    units = (
+        units[START_TOKEN_SIZE:]
+        if units is not None and not np.all(pd.isna(units))
         else None
     )
-    units = units[START_TOKEN_SIZE:] if units is not None else None
 
     # TODO:Need to decode if the input is tokenized
     [start_year, start_age, start_gender, start_race] = concept_ids[
@@ -441,6 +445,9 @@ def gpt_to_omop_converter_batch(
         ]:
             # If it's a start token, skip it
             pass
+        elif event.endswith("/0"):
+            # This should capture the concept such as Visit/0, Discharge/0
+            pass
         else:
             try:
                 concept_id = int(event)
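Note: the new `np.all(pd.isna(...))` guard makes a column that exists but is entirely NaN/None behave like a missing column. A quick illustration:

    import numpy as np
    import pandas as pd

    units = np.array([None, None, None], dtype=object)
    np.all(pd.isna(units))  # True -> the converter now falls back to None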
cehrgpt/gpt_utils.py
CHANGED
@@ -1,7 +1,12 @@
 import random
 import re
-from datetime import date, timedelta
-from typing import List, Sequence, Tuple
+from datetime import date, datetime, timedelta, timezone
+from typing import List, Optional, Sequence, Tuple, Union
+
+import numpy as np
+from cehrbert_data.const.artificial_tokens import DEATH_TOKEN
+from meds import death_code
+from transformers.utils import logging
 
 from cehrgpt.cehrgpt_args import SamplingStrategy
 from cehrgpt.models.special_tokens import (
@@ -14,6 +19,7 @@ from cehrgpt.models.special_tokens import (
 MEDS_CODE_PATTERN = re.compile(r".*/.*")
 INPATIENT_ATT_PATTERN = re.compile(r"(?:VS-|i-)D(\d+)(?:-VE)?")
 DEMOGRAPHIC_PROMPT_SIZE = 4
+logger = logging.get_logger("transformers")
 
 
 class RandomSampleCache:
@@ -62,6 +68,68 @@
         return self._cache.pop()
 
 
+def construct_time_sequence(
+    concept_ids: List[str], epoch_times: Optional[List[Union[int, float]]] = None
+) -> List[float]:
+    if epoch_times is not None:
+        return epoch_times
+
+    if concept_ids[0].lower().startswith("year"):
+        year_str = concept_ids[0].split(":")[1]
+    else:
+        year_str = "1985"
+
+    datetime_cursor = datetime(
+        int(year_str), month=1, day=1, hour=0, minute=0, second=0
+    ).replace(tzinfo=timezone.utc)
+    epoch_times = []
+    for concept_id in concept_ids:
+        if is_att_token(concept_id):
+            att_days = extract_time_interval_in_days(concept_id)
+            datetime_cursor += timedelta(days=att_days)
+        epoch_times.append(datetime_cursor.timestamp())
+    return epoch_times
+
+
+def construct_age_sequence(
+    concept_ids: List[str], ages: Optional[List[int]] = None
+) -> List[int]:
+    if ages is not None:
+        return ages
+    elif concept_ids[1].lower().startswith("age"):
+        age_str = concept_ids[1].split(":")[1]
+        assert age_str.isnumeric(), f"age_str: {age_str}"
+        ages = []
+        time_delta = 0
+        for concept_id in concept_ids:
+            if is_att_token(concept_id):
+                time_delta += extract_time_interval_in_days(concept_id)
+            ages.append(int(age_str) + time_delta // 365)
+        return ages
+    else:
+        logger.warning(
+            "The second token is not a valid age token. The first 4 tokens are: %s. "
+            "Trying to fall back to ages, but it is not valid either %s. "
+            "Fall back to a zero vector [0, 0, 0, ...., 0]",
+            concept_ids[:4],
+            ages,
+        )
+        return np.zeros_like(concept_ids, dtype=int).tolist()
+
+
+def multiple_of_10(n: int) -> int:
+    return ((n // 10) + 1) * 10
+
+
+def encode_demographics(
+    age: int, gender: int, race: int, max_age=200, max_gender=10, max_race=10
+) -> int:
+    assert 0 <= age < max_age, f"age: {age}"
+    assert 0 <= gender < max_gender, f"gender: {gender}"
+    assert 0 <= race < max_race, f"race: {race}"
+    return age + max_age * gender + max_age * max_gender * race
+
+
 def collect_demographic_prompts_at_visits(patient_history: List[str]):
     demographic_prompts_at_visits = []
     start_year, start_age, start_gender, start_race = patient_history[
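To make the new helpers concrete, a small worked example (it assumes `is_att_token` recognizes the day-interval token "D365"; the clinical token strings are illustrative):

    from cehrgpt.gpt_utils import (
        construct_age_sequence,
        encode_demographics,
        multiple_of_10,
    )

    tokens = ["year:2010", "age:45", "Gender/F", "Race/W", "[VS]", "320128", "[VE]",
              "D365", "[VS]", "320128", "[VE]"]
    construct_age_sequence(tokens)
    # -> [45, 45, 45, 45, 45, 45, 45, 46, 46, 46, 46]
    # The 365-day interval token bumps the running age by 365 // 365 = 1 year.

    # encode_demographics packs (age, gender, race) into one integer:
    # age + 200 * gender + 200 * 10 * race
    encode_demographics(age=45, gender=2, race=1)  # 45 + 400 + 2000 = 2445

    multiple_of_10(45)  # 50: the next multiple of 10 strictly above n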
@@ -156,7 +224,7 @@ def random_slice_gpt_sequence(concept_ids, max_seq_len):
         )
     ):
         current_token = concept_ids[i]
-        if current_token
+        if is_visit_end(current_token):
             random_end_index = i
             break
     return random_starting_index, random_end_index, demographic_tokens
@@ -198,6 +266,8 @@ def get_cehrgpt_output_folder(args, cehrgpt_tokenizer) -> str:
 def is_clinical_event(token: str, meds: bool = False) -> bool:
     if token.isnumeric():
         return True
+    if token in [DEATH_TOKEN, death_code]:
+        return True
     if meds:
         return bool(MEDS_CODE_PATTERN.match(token))
     return False
cehrgpt/models/activations.py
ADDED
@@ -0,0 +1,27 @@
+# From https://github.com/bzhangGo/rmsnorm/blob/master/rmsnorm_torch.py
+# coding=utf-8
+
+from __future__ import absolute_import, division, print_function
+
+import torch
+import torch.nn as nn
+import transformers.pytorch_utils
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral
+class RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """MistralRMSNorm is equivalent to T5LayerNorm."""
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+
+transformers.pytorch_utils.ALL_LAYERNORM_LAYERS.extend([RMSNorm])
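Usage is drop-in for a LayerNorm over the hidden dimension; a minimal check:

    import torch
    from cehrgpt.models.activations import RMSNorm

    norm = RMSNorm(hidden_size=768, eps=1e-6)
    x = torch.randn(2, 16, 768)  # (batch, seq, hidden)
    y = norm(x)  # variance computed in float32, result cast back to x.dtype
    assert y.shape == x.shape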
cehrgpt/models/config.py
CHANGED
@@ -106,6 +106,8 @@ class CEHRGPTConfig(PretrainedConfig):
         n_head=12,
         n_inner=None,
         activation_function="gelu_new",
+        decoder_mlp="GPT2MLP",
+        mlp_bias=False,
         resid_pdrop=0.1,
         embd_pdrop=0.1,
         attn_pdrop=0.1,
@@ -124,7 +126,7 @@ class CEHRGPTConfig(PretrainedConfig):
         ve_token_id=None,
         scale_attn_by_inverse_layer_idx=False,
         reorder_and_upcast_attn=False,
-
+        apply_rotary=False,
         include_values=False,
         value_vocab_size=None,
         include_ttv_prediction=False,
@@ -169,6 +171,8 @@ class CEHRGPTConfig(PretrainedConfig):
         self.n_head = n_head
         self.n_inner = n_inner
         self.activation_function = activation_function
+        self.decoder_mlp = decoder_mlp
+        self.mlp_bias = mlp_bias
         self.resid_pdrop = resid_pdrop
         self.embd_pdrop = embd_pdrop
         self.attn_pdrop = attn_pdrop
@@ -188,7 +192,7 @@
         self.eos_token_id = eos_token_id
         self.lab_token_ids = lab_token_ids
 
-        self.
+        self.apply_rotary = apply_rotary
         self.include_values = include_values
         self.value_vocab_size = value_vocab_size
 
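A hedged sketch of how the new knobs are passed at construction time (it assumes the remaining constructor defaults shown above are acceptable):

    from cehrgpt.models.config import CEHRGPTConfig

    config = CEHRGPTConfig(
        decoder_mlp="GPT2MLP",  # new in 0.1.3: selects the decoder MLP implementation
        mlp_bias=False,         # new in 0.1.3: bias terms in the MLP projections
        apply_rotary=False,     # new in 0.1.3: toggles rotary position embeddings
    )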