PyPI - cehrgpt - Versions diffs - 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl - Mend

cehrgpt 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

cehrgpt/analysis/htn_treatment_pathway.py +546 -0
cehrgpt/analysis/treatment_pathway/__init__.py +0 -0
cehrgpt/analysis/treatment_pathway/depression_treatment_pathway.py +94 -0
cehrgpt/analysis/treatment_pathway/diabetes_treatment_pathway.py +94 -0
cehrgpt/analysis/treatment_pathway/htn_treatment_pathway.py +94 -0
cehrgpt/analysis/treatment_pathway/treatment_pathway.py +631 -0
cehrgpt/data/cehrgpt_data_processor.py +549 -0
cehrgpt/data/hf_cehrgpt_dataset.py +4 -0
cehrgpt/data/hf_cehrgpt_dataset_collator.py +286 -629
cehrgpt/data/hf_cehrgpt_dataset_mapping.py +60 -14
cehrgpt/generation/cehrgpt_conditional_generation.py +316 -0
cehrgpt/generation/generate_batch_hf_gpt_sequence.py +35 -15
cehrgpt/generation/omop_converter_batch.py +11 -4
cehrgpt/gpt_utils.py +73 -3
cehrgpt/models/activations.py +27 -0
cehrgpt/models/config.py +6 -2
cehrgpt/models/gpt2.py +560 -0
cehrgpt/models/hf_cehrgpt.py +193 -459
cehrgpt/models/tokenization_hf_cehrgpt.py +380 -50
cehrgpt/omop/ontology.py +154 -0
cehrgpt/runners/data_utils.py +17 -6
cehrgpt/runners/hf_cehrgpt_finetune_runner.py +33 -79
cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +48 -44
cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +58 -34
cehrgpt/runners/hyperparameter_search_util.py +180 -69
cehrgpt/runners/sample_packing_trainer.py +11 -2
cehrgpt/tools/linear_prob/compute_cehrgpt_features.py +27 -31
cehrgpt-0.1.3.dist-info/METADATA +238 -0
{cehrgpt-0.1.1.dist-info → cehrgpt-0.1.3.dist-info}/RECORD +33 -22
cehrgpt-0.1.1.dist-info/METADATA +0 -115
/cehrgpt/tools/{merge_synthetic_real_dataasets.py → merge_synthetic_real_datasets.py} +0 -0
{cehrgpt-0.1.1.dist-info → cehrgpt-0.1.3.dist-info}/WHEEL +0 -0
{cehrgpt-0.1.1.dist-info → cehrgpt-0.1.3.dist-info}/licenses/LICENSE +0 -0
{cehrgpt-0.1.1.dist-info → cehrgpt-0.1.3.dist-info}/top_level.txt +0 -0

cehrgpt/data/hf_cehrgpt_dataset_mapping.py CHANGED Viewed

@@ -21,7 +21,6 @@ from cehrbert_data.const.artificial_tokens import (
     DISCHARGE_UNKNOWN_TOKEN,
     GENDER_UNKNOWN_TOKEN,
     RACE_UNKNOWN_TOKEN,
-    VISIT_UNKNOWN_TOKEN,
 )
 from cehrbert_data.const.common import NA
 from cehrbert_data.decorators.patient_event_decorator_base import get_att_function
@@ -29,6 +28,12 @@ from datasets.formatting.formatting import LazyBatch
 from dateutil.relativedelta import relativedelta
 from pandas import Series
+from cehrgpt.gpt_utils import (
+    construct_age_sequence,
+    construct_time_sequence,
+    encode_demographics,
+    multiple_of_10,
+)
 from cehrgpt.models.tokenization_hf_cehrgpt import (
     NONE_BIN,
     UNKNOWN_BIN,
@@ -44,13 +49,20 @@ CEHRGPT_COLUMNS = [
     "concept_values",
     "units",
     "epoch_times",
+    "ages",
 ]
-def convert_date_to_posix_time(index_date: datetime.date) -> float:
-    return datetime.datetime.combine(
-        index_date, datetime.datetime.min.time()
-    ).timestamp()
+def convert_date_to_posix_time(index_date: Union[datetime.date, int, float]) -> float:
+    """Convert a date, a datetime, or an already-numeric value to POSIX seconds.
+    Dates and datetimes are stamped with UTC before conversion, so the result does
+    not depend on the local timezone of the machine running the code.
+    Args:
+        index_date: A ``datetime.date`` (interpreted as midnight UTC of that day),
+            a ``datetime.datetime`` (time of day is preserved), or a number that is
+            returned unchanged.
+    Returns:
+        Seconds since the Unix epoch, or ``index_date`` itself when it is not a
+        date/datetime.
+    """
+    # datetime.datetime is a subclass of datetime.date, so it has to be tested first;
+    # otherwise the date branch captures it and combine() drops the time of day.
+    if isinstance(index_date, datetime.datetime):
+        # NOTE(review): replace() overwrites any existing tzinfo instead of converting
+        # it -- this assumes callers pass naive datetimes that already hold UTC values.
+        return index_date.replace(tzinfo=datetime.timezone.utc).timestamp()
+    if isinstance(index_date, datetime.date):
+        return (
+            datetime.datetime.combine(index_date, datetime.datetime.min.time())
+            .replace(tzinfo=datetime.timezone.utc)
+            .timestamp()
+        )
+    return index_date
 class DatasetMappingDecorator(DatasetMapping):
@@ -116,6 +128,7 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
         cehrgpt_record: Dict[str, Any],
         code: str,
         time: datetime.datetime,
+        age: int,
         concept_value_mask: int = 0,
         number_as_value: float = 0.0,
         concept_as_value: str = "0",
@@ -123,17 +136,21 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
         unit: str = NA,
     ) -> None:
         cehrgpt_record["concept_ids"].append(replace_escape_chars(code))
+        cehrgpt_record["ages"].append(age)
         cehrgpt_record["concept_value_masks"].append(concept_value_mask)
         cehrgpt_record["number_as_values"].append(number_as_value)
         cehrgpt_record["concept_as_values"].append(concept_as_value)
         cehrgpt_record["units"].append(unit)
         cehrgpt_record["is_numeric_types"].append(is_numeric_type)
-        cehrgpt_record["epoch_times"].append(time.timestamp())
+        cehrgpt_record["epoch_times"].append(
+            time.replace(tzinfo=datetime.timezone.utc).timestamp()
+        )
     def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
         cehrgpt_record = {
             "person_id": record["patient_id"],
             "concept_ids": [],
+            "ages": [],
             "concept_value_masks": [],
             "number_as_values": [],
             "concept_as_values": [],
@@ -161,14 +178,21 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
         first_visit_start_datetime: datetime.datetime = get_value(
             first_visit, "visit_start_datetime"
         )
+        starting_age = relativedelta(first_visit_start_datetime, birth_datetime).years
         year_str = f"year:{str(first_visit_start_datetime.year)}"
-        age_str = f"age:{str(relativedelta(first_visit_start_datetime, birth_datetime).years)}"
+        age_str = f"age:{starting_age}"
+        self._update_cehrgpt_record(
+            cehrgpt_record, year_str, first_visit_start_datetime, starting_age
+        )
+        self._update_cehrgpt_record(
+            cehrgpt_record, age_str, first_visit_start_datetime, starting_age
+        )
+        self._update_cehrgpt_record(
+            cehrgpt_record, gender, first_visit_start_datetime, starting_age
+        )
         self._update_cehrgpt_record(
-            cehrgpt_record, year_str, first_visit_start_datetime
+            cehrgpt_record, race, first_visit_start_datetime, starting_age
         )
-        self._update_cehrgpt_record(cehrgpt_record, age_str, first_visit_start_datetime)
-        self._update_cehrgpt_record(cehrgpt_record, gender, first_visit_start_datetime)
-        self._update_cehrgpt_record(cehrgpt_record, race, first_visit_start_datetime)
         # Use a data cursor to keep track of time
         datetime_cursor: Optional[datetime.datetime] = None
@@ -204,6 +228,7 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
                     cehrgpt_record,
                     code=self._time_token_function(time_delta),
                     time=visit_start_datetime,
+                    age=relativedelta(datetime_cursor, birth_datetime).years,
                 )
             datetime_cursor = visit_start_datetime
@@ -212,12 +237,14 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
                 cehrgpt_record,
                 code="[VS]",
                 time=datetime_cursor,
+                age=relativedelta(datetime_cursor, birth_datetime).years,
             )
             # Add a visit type token
             self._update_cehrgpt_record(
                 cehrgpt_record,
                 code=visit_type,
                 time=datetime_cursor,
+                age=relativedelta(datetime_cursor, birth_datetime).years,
             )
             # We need to insert an inpatient hour token right after the visit type, we calculate the hour interval
             # with respect to the midnight of the day
@@ -228,6 +255,7 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
                         cehrgpt_record,
                         code=f"i-H{datetime_cursor.hour}",
                         time=datetime_cursor,
+                        age=relativedelta(datetime_cursor, birth_datetime).years,
                     )
             # Keep track of the existing outpatient events, we don't want to add them again
@@ -274,6 +302,7 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
                             cehrgpt_record,
                             code=f"i-{self._inpatient_time_token_function(time_diff_days)}",
                             time=event_time,
+                            age=relativedelta(event_time, birth_datetime).years,
                         )
                     if self._include_inpatient_hour_token:
@@ -293,6 +322,7 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
                                 cehrgpt_record,
                                 code=f"i-H{time_diff_hours}",
                                 time=event_time,
+                                age=relativedelta(event_time, birth_datetime).years,
                             )
                 if event_identity in existing_duplicate_events:
@@ -302,6 +332,7 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
                     cehrgpt_record,
                     code=code,
                     time=event_time,
+                    age=relativedelta(event_time, birth_datetime).years,
                     concept_value_mask=concept_value_mask,
                     unit=unit,
                     number_as_value=numeric_value if numeric_value else 0.0,
@@ -341,6 +372,7 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
                         cehrgpt_record,
                         code=discharge_facility,
                         time=datetime_cursor,
+                        age=relativedelta(datetime_cursor, birth_datetime).years,
                     )
             # Reuse the age and date calculated for the last event in the patient timeline
@@ -348,6 +380,7 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
                 cehrgpt_record,
                 code="[VE]",
                 time=datetime_cursor,
+                age=relativedelta(datetime_cursor, birth_datetime).years,
             )
         # Generate the orders of the concepts that the cehrbert dataset mapping function expects
@@ -360,7 +393,9 @@ class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
         cehrgpt_record["num_of_visits"] = len(visits)
         if record.get("index_date", None) is not None:
-            cehrgpt_record["index_date"] = record["index_date"]
+            cehrgpt_record["index_date"] = (
+                record["index_date"].replace(tzinfo=datetime.timezone.utc).timestamp()
+            )
         if record.get("label", None) is not None:
             cehrgpt_record["label"] = record["label"]
         if record.get("age_at_index", None) is not None:
@@ -419,6 +454,13 @@ class HFCehrGptTokenizationMapping(DatasetMappingDecorator):
         return record
     def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
+        # Reconstruct the ages input before the filter is applied
+        record["ages"] = construct_age_sequence(
+            record["concept_ids"], record.get("ages", None)
+        )
+        record["epoch_times"] = construct_time_sequence(
+            record["concept_ids"], record.get("epoch_times", None)
+        )
         # Remove the tokens from patient sequences that do not exist in the tokenizer
         record = self.filter_out_invalid_tokens(record)
         # If any concept has a value associated with it, we normalize the value
@@ -529,9 +571,13 @@ class ExtractTokenizedSequenceDataMapping:
         prediction_start_end_times = [
             (
                 self._calculate_prediction_start_time(
-                    prediction_time_label_map["index_date"].timestamp()
+                    prediction_time_label_map["index_date"]
+                    .replace(tzinfo=datetime.timezone.utc)
+                    .timestamp()
                 ),
-                prediction_time_label_map["index_date"].timestamp(),
+                prediction_time_label_map["index_date"]
+                .replace(tzinfo=datetime.timezone.utc)
+                .timestamp(),
                 prediction_time_label_map["label"],
             )
             for prediction_time_label_map in prediction_times

cehrgpt/generation/cehrgpt_conditional_generation.py ADDED Viewed

@@ -0,0 +1,316 @@
+import datetime
+import os
+import random
+import shutil
+from pathlib import Path
+from typing import Any, Dict
+import numpy as np
+import polars as pl
+import torch
+import torch.distributed as dist
+from cehrbert.runners.runner_util import generate_prepared_ds_path
+from datasets import load_from_disk
+from meds import held_out_split, train_split, tuning_split
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers.trainer_utils import is_main_process
+from transformers.utils import is_flash_attn_2_available, logging
+from cehrgpt.data.hf_cehrgpt_dataset import create_cehrgpt_finetuning_dataset
+from cehrgpt.data.hf_cehrgpt_dataset_collator import CehrGptDataCollator
+from cehrgpt.generation.generate_batch_hf_gpt_sequence import (
+    generate_single_batch,
+    normalize_value,
+)
+from cehrgpt.gpt_utils import (
+    extract_time_interval_in_days,
+    extract_time_interval_in_hours,
+    is_att_token,
+    is_inpatient_hour_token,
+    is_visit_end,
+    is_visit_start,
+)
+from cehrgpt.models.hf_cehrgpt import CEHRGPT2LMHeadModel
+from cehrgpt.models.tokenization_hf_cehrgpt import CehrGptTokenizer
+from cehrgpt.runners.data_utils import (
+    extract_cohort_sequences,
+    prepare_finetune_dataset,
+)
+from cehrgpt.runners.gpt_runner_util import parse_runner_args
+from cehrgpt.runners.hf_cehrgpt_pretrain_runner import tokenizer_exists
+LOG = logging.get_logger("transformers")
+def map_data_split_name(split: str) -> str:
+    """Translate a dataset split name into the corresponding ``meds`` split constant.
+    Args:
+        split: One of ``"train"``, ``"validation"`` or ``"test"``.
+    Returns:
+        ``train_split``, ``tuning_split`` or ``held_out_split`` respectively.
+    Raises:
+        ValueError: If ``split`` is none of the three supported names.
+    """
+    # Reject unsupported names up front so the mapping below only sees valid input.
+    if split not in ("train", "validation", "test"):
+        raise ValueError(f"Unknown split: {split}")
+    if split == "train":
+        return train_split
+    return tuning_split if split == "validation" else held_out_split
+def seed_all(seed: int = 42):
+    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random number generators.
+    Args:
+        seed: The value handed to every generator and exported as ``PYTHONHASHSEED``.
+    """
+    seeders = (
+        random.seed,  # Python's random module
+        np.random.seed,  # NumPy global generator
+        torch.manual_seed,  # PyTorch CPU generator
+        torch.cuda.manual_seed,  # current CUDA device
+        torch.cuda.manual_seed_all,  # every CUDA device
+    )
+    for apply_seed in seeders:
+        apply_seed(seed)
+    # Exported through the environment; the original comment ties this to
+    # reproducibility in dataloader workers.
+    os.environ["PYTHONHASHSEED"] = str(seed)
+def _utc_naive_datetime(epoch_seconds: float) -> datetime.datetime:
+    """Return the naive datetime holding the UTC wall-clock time of ``epoch_seconds``."""
+    # Equivalent to datetime.utcfromtimestamp(), which is deprecated since Python 3.12.
+    return datetime.datetime.fromtimestamp(
+        epoch_seconds, tz=datetime.timezone.utc
+    ).replace(tzinfo=None)
+def generate_trajectories_per_batch(
+    batch: Dict[str, Any],
+    cehrgpt_tokenizer: CehrGptTokenizer,
+    cehrgpt_model: CEHRGPT2LMHeadModel,
+    device,
+    data_output_path: Path,
+    max_length: int,
+):
+    """Generate one continuation per prompt in ``batch`` and write the new events to parquet.
+    The prompts are passed to ``generate_single_batch`` with unrestricted sampling
+    (``top_p=1.0`` and ``top_k`` equal to the vocabulary size). Tokens produced after the
+    prompt are then walked in order: time tokens advance a running timestamp, and
+    every remaining event is emitted with that timestamp.
+    Args:
+        batch: Collated batch; the keys ``person_id``, ``index_date``, ``epoch_times``,
+            ``input_ids``, ``ages``, ``value_indicators`` and ``values`` are read.
+        cehrgpt_tokenizer: Tokenizer used for generation and for decoding values.
+        cehrgpt_model: Model that generates the continuations.
+        device: Device handed to ``generate_single_batch``.
+        data_output_path: Parquet file that receives one row per generated event.
+        max_length: Maximum total sequence length passed to generation.
+    """
+    # reshape(-1) instead of squeeze(): squeeze() collapses a one-sample batch to a
+    # 0-d tensor whose tolist() is a bare scalar, which cannot be indexed per sample.
+    subject_ids = batch["person_id"].reshape(-1).detach().cpu().tolist()
+    prediction_times = batch["index_date"].reshape(-1).detach().cpu().tolist()
+    batched_epoch_times = batch["epoch_times"].detach().cpu().tolist()
+    batch_sequences = generate_single_batch(
+        cehrgpt_model,
+        cehrgpt_tokenizer,
+        batch["input_ids"],
+        ages=batch["ages"],
+        values=batch["values"],
+        value_indicators=batch["value_indicators"],
+        max_length=max_length,
+        top_p=1.0,
+        top_k=cehrgpt_tokenizer.vocab_size,
+        device=device,
+    )
+    # Clear the cache
+    torch.cuda.empty_cache()
+    # Hoisted out of the loops: tokens that never become output events
+    skipped_tokens = (cehrgpt_tokenizer.pad_token, cehrgpt_tokenizer.end_token)
+    trajectories = []
+    for sample_i, (concept_ids, _value_indicators, values) in enumerate(
+        zip(
+            batch_sequences["sequences"],
+            batch_sequences["value_indicators"],
+            batch_sequences["values"],
+        )
+    ):
+        (
+            concept_ids,
+            _is_numeric_types,
+            number_as_values,
+            concept_as_values,
+            units,
+        ) = normalize_value(concept_ids, values, cehrgpt_tokenizer)
+        epoch_times = batched_epoch_times[sample_i]
+        input_length = len(epoch_times)
+        # The last prompt position carries the last observed event time before the
+        # prediction time; generated timestamps are accumulated from there.
+        # NOTE(review): the original comment says the collator pads on the left, which
+        # is what makes the final position a real event -- confirm against the collator.
+        window_last_observed = epoch_times[-1]
+        current_cursor = window_last_observed
+        generated_epoch_times = []
+        valid_indices = []
+        # Only positions past the prompt are newly generated tokens
+        for i, concept_id in enumerate(
+            concept_ids[input_length:], start=input_length
+        ):
+            if concept_id in skipped_tokens:
+                continue
+            # Time tokens move the cursor forward but are not events themselves
+            if is_att_token(concept_id):
+                current_cursor += extract_time_interval_in_days(concept_id) * 24 * 3600
+            elif is_inpatient_hour_token(concept_id):
+                current_cursor += extract_time_interval_in_hours(concept_id) * 3600
+            elif is_visit_start(concept_id) or is_visit_end(concept_id):
+                continue
+            else:
+                valid_indices.append(i)
+                generated_epoch_times.append(_utc_naive_datetime(current_cursor))
+        trajectories.append(
+            {
+                "subject_id": subject_ids[sample_i],
+                "prediction_time": _utc_naive_datetime(prediction_times[sample_i]),
+                "window_last_observed_time": _utc_naive_datetime(
+                    window_last_observed
+                ),
+                "times": generated_epoch_times,
+                "concept_ids": np.asarray(concept_ids)[valid_indices].tolist(),
+                "numeric_values": np.asarray(number_as_values)[valid_indices].tolist(),
+                "text_value": np.asarray(concept_as_values)[valid_indices].tolist(),
+                "units": np.asarray(units)[valid_indices].tolist(),
+            }
+        )
+    # One row per generated event, with the column names used in the output file
+    trajectories = (
+        pl.DataFrame(trajectories)
+        .explode(["times", "concept_ids", "numeric_values", "text_value", "units"])
+        .rename(
+            {
+                "times": "time",
+                "concept_ids": "code",
+                "numeric_values": "numeric_value",
+                "units": "unit",
+            }
+        )
+        .select(
+            "subject_id",
+            "prediction_time",
+            "window_last_observed_time",
+            "time",
+            "code",
+            "numeric_value",
+            "text_value",
+            "unit",
+        )
+    )
+    trajectories.write_parquet(data_output_path)
+def main():
+    """Generate trajectories for every sample of a cohort and write them as parquet files.
+    Steps: parse the runner arguments, load the tokenizer and model, load or build the
+    tokenized cohort dataset, then for each requested trajectory index and each split
+    run generation batch by batch. Output goes to
+    ``<output_dir>/<meds split>/<trajectory index>/<batch index>.parquet``; batches whose
+    file already exists are skipped, so an interrupted run can be resumed.
+    """
+    cehrgpt_args, data_args, model_args, training_args = parse_runner_args()
+    if torch.cuda.is_available():
+        device = torch.device("cuda")
+    else:
+        device = torch.device("cpu")
+    cehrgpt_tokenizer = CehrGptTokenizer.from_pretrained(
+        model_args.tokenizer_name_or_path
+    )
+    cehrgpt_model = (
+        CEHRGPT2LMHeadModel.from_pretrained(
+            model_args.model_name_or_path,
+            attn_implementation=(
+                "flash_attention_2" if is_flash_attn_2_available() else "eager"
+            ),
+        )
+        .eval()
+        .to(device)
+    )
+    cehrgpt_model.generation_config.pad_token_id = cehrgpt_tokenizer.pad_token_id
+    cehrgpt_model.generation_config.eos_token_id = cehrgpt_tokenizer.end_token_id
+    cehrgpt_model.generation_config.bos_token_id = cehrgpt_tokenizer.end_token_id
+    # exist_ok avoids the check-then-create race when several processes start together
+    os.makedirs(training_args.output_dir, exist_ok=True)
+    prepared_ds_path = generate_prepared_ds_path(
+        data_args, model_args, data_folder=data_args.cohort_folder
+    )
+    processed_dataset = None
+    if any(prepared_ds_path.glob("*")):
+        LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
+        processed_dataset = load_from_disk(str(prepared_ds_path))
+        LOG.info("Prepared dataset loaded from disk...")
+        if cehrgpt_args.expand_tokenizer:
+            if tokenizer_exists(training_args.output_dir):
+                cehrgpt_tokenizer = CehrGptTokenizer.from_pretrained(
+                    training_args.output_dir
+                )
+            else:
+                # The cached dataset cannot be used without the tokenizer it was
+                # built with, so it is discarded and rebuilt below.
+                LOG.warning(
+                    f"CehrGptTokenizer must exist in {training_args.output_dir} "
+                    f"when the dataset has been processed and expand_tokenizer is set to True. "
+                    f"Please delete the processed dataset at {prepared_ds_path}."
+                )
+                processed_dataset = None
+                shutil.rmtree(prepared_ds_path)
+    if processed_dataset is None and is_main_process(training_args.local_rank):
+        # If the full dataset has been tokenized, we don't want to tokenize the cohort containing
+        # the subset of the data. We should slice out the portion of the tokenized sequences for each sample
+        if cehrgpt_args.tokenized_full_dataset_path is not None:
+            processed_dataset = extract_cohort_sequences(data_args, cehrgpt_args)
+        else:
+            # Organize them into a single DatasetDict
+            final_splits = prepare_finetune_dataset(
+                data_args, training_args, cehrgpt_args
+            )
+            # TODO: temp solution, this column is mixed typed and causes an issue when transforming the data
+            if not data_args.streaming:
+                all_columns = final_splits["train"].column_names
+                if "visit_concept_ids" in all_columns:
+                    final_splits = final_splits.remove_columns(["visit_concept_ids"])
+            processed_dataset = create_cehrgpt_finetuning_dataset(
+                dataset=final_splits,
+                cehrgpt_tokenizer=cehrgpt_tokenizer,
+                data_args=data_args,
+            )
+        if not data_args.streaming:
+            processed_dataset.save_to_disk(prepared_ds_path)
+            processed_dataset.cleanup_cache_files()
+    # After main-process-only operations, synchronize all processes to ensure consistency
+    if dist.is_available() and dist.is_initialized():
+        dist.barrier()
+    # Non-main processes skipped the build above; pick up what the main process saved
+    # instead of failing later on a None dataset.
+    # NOTE(review): nothing is saved in streaming mode, so this load raises there --
+    # confirm whether multi-process streaming generation needs to be supported.
+    if processed_dataset is None:
+        processed_dataset = load_from_disk(str(prepared_ds_path))
+    # We suppress the additional learning objectives in fine-tuning
+    data_collator = CehrGptDataCollator(
+        tokenizer=cehrgpt_tokenizer,
+        max_length=cehrgpt_args.generation_input_length,
+        include_values=cehrgpt_model.config.include_values,
+        pretraining=False,
+        include_ttv_prediction=False,
+        use_sub_time_tokenization=False,
+        include_demographics=False,
+        add_linear_prob_token=False,
+    )
+    LOG.info(
+        "Generating %s trajectories per sample",
+        cehrgpt_args.num_of_trajectories_per_sample,
+    )
+    # Prompt length plus the number of new tokens gives the total generation length
+    generation_max_length = (
+        cehrgpt_args.generation_max_new_tokens + cehrgpt_args.generation_input_length
+    )
+    for sample_i in range(cehrgpt_args.num_of_trajectories_per_sample):
+        for split, dataset in processed_dataset.items():
+            meds_split = map_data_split_name(split)
+            dataloader = DataLoader(
+                dataset=dataset,
+                batch_size=training_args.per_device_eval_batch_size,
+                num_workers=training_args.dataloader_num_workers,
+                collate_fn=data_collator,
+                pin_memory=training_args.dataloader_pin_memory,
+            )
+            sample_output_dir = (
+                Path(training_args.output_dir) / meds_split / f"{sample_i}"
+            )
+            sample_output_dir.mkdir(exist_ok=True, parents=True)
+            for batch_i, batch in tqdm(
+                enumerate(dataloader),
+                desc=f"Generating Trajectories for split {meds_split} with trajectory {sample_i + 1}",
+            ):
+                output_parquet_file = sample_output_dir / f"{batch_i}.parquet"
+                # Existing files are treated as finished work from an earlier run
+                if output_parquet_file.exists():
+                    LOG.info("%s already exists, skip...", output_parquet_file)
+                    continue
+                generate_trajectories_per_batch(
+                    batch,
+                    cehrgpt_tokenizer,
+                    cehrgpt_model,
+                    device,
+                    output_parquet_file,
+                    generation_max_length,
+                )
+if __name__ == "__main__":
+    # Seed every RNG up front, before main() loads the model or samples anything
+    seed_all(42)
+    main()

cehrgpt/generation/generate_batch_hf_gpt_sequence.py CHANGED Viewed

@@ -2,7 +2,7 @@ import datetime
 import os
 import random
 import uuid
-from typing import Any, Dict, List, Optional, Sequence, Tuple
+from typing import Any, Dict, Optional, Sequence, Tuple
 import numpy as np
 import pandas as pd
@@ -13,7 +13,7 @@ from transformers.utils import is_flash_attn_2_available, logging
 from cehrgpt.cehrgpt_args import create_inference_base_arg_parser
 from cehrgpt.generation.omop_converter_batch import START_TOKEN_SIZE
-from cehrgpt.gpt_utils import get_cehrgpt_output_folder
+from cehrgpt.gpt_utils import construct_age_sequence, get_cehrgpt_output_folder
 from cehrgpt.models.hf_cehrgpt import CEHRGPT2LMHeadModel
 from cehrgpt.models.special_tokens import END_TOKEN
 from cehrgpt.models.tokenization_hf_cehrgpt import (
@@ -72,9 +72,13 @@ def normalize_value(
 def generate_single_batch(
     model: CEHRGPT2LMHeadModel,
-    tokenizer: CehrGptTokenizer,
-    prompts: List[List[int]],
-    max_new_tokens=512,
+    cehrgpt_tokenizer: CehrGptTokenizer,
+    prompts: torch.Tensor,
+    max_length: int,
+    ages: Optional[torch.Tensor] = None,
+    values: Optional[torch.Tensor] = None,
+    value_indicators: Optional[torch.Tensor] = None,
+    max_new_tokens: Optional[int] = None,
     mini_num_of_concepts=1,
     top_p=0.95,
     top_k=50,
@@ -88,7 +92,8 @@ def generate_single_batch(
     with torch.no_grad():
         generation_config = GenerationConfig(
             repetition_penalty=repetition_penalty,
-            max_length=max_new_tokens,
+            max_new_tokens=max_new_tokens,
+            max_length=max_length,
             min_length=mini_num_of_concepts,
             temperature=temperature,
             top_p=top_p,
@@ -107,20 +112,33 @@ def generate_single_batch(
             num_beam_groups=num_beam_groups,
             epsilon_cutoff=epsilon_cutoff,
         )
-        batched_prompts = torch.tensor(prompts).to(device)
+        batched_prompts = prompts.to(device)
+        if ages is not None:
+            ages = ages.to(device)
+        if values is not None:
+            values = values.to(device)
+        if value_indicators is not None:
+            value_indicators = value_indicators.to(device)
         results = model.generate(
             inputs=batched_prompts,
+            ages=ages,
+            values=values,
+            value_indicators=value_indicators,
             generation_config=generation_config,
-            lab_token_ids=tokenizer.lab_token_ids,
+            cehrgpt_tokenizer=cehrgpt_tokenizer,
         )
     sequences = [
-        tokenizer.decode(seq.cpu().numpy(), skip_special_tokens=False)
+        cehrgpt_tokenizer.decode(seq.cpu().numpy(), skip_special_tokens=False)
         for seq in results.sequences
     ]
     if results.sequence_vals is not None:
         values = [
-            tokenizer.decode_value(values.cpu().numpy(), skip_special_tokens=False)
+            cehrgpt_tokenizer.decode_value(
+                values.cpu().numpy(), skip_special_tokens=False
+            )
             for values in results.sequence_vals
         ]
     else:
@@ -202,6 +220,7 @@ def main(args):
         # Randomly pick demographics from the existing population
         random_prompts = []
+        random_prompt_ages = []
         iter = 0
         while len(random_prompts) < args.batch_size:
             for row in dataset.select(
@@ -212,9 +231,9 @@ def main(args):
                     <= len(row["concept_ids"])
                     <= max_seq_allowed
                 ):
-                    random_prompts.append(
-                        cehrgpt_tokenizer.encode(row["concept_ids"][:prompt_size])
-                    )
+                    prompt = row["concept_ids"][:prompt_size]
+                    random_prompts.append(cehrgpt_tokenizer.encode(prompt))
+                    random_prompt_ages.append(construct_age_sequence(prompt))
                 iter += 1
                 if not random_prompts and iter > 10:
                     raise RuntimeError(
@@ -225,8 +244,9 @@ def main(args):
         batch_sequences = generate_single_batch(
             cehrgpt_model,
             cehrgpt_tokenizer,
-            random_prompts[: args.batch_size],
-            max_new_tokens=args.context_window,
+            torch.tensor(random_prompts[: args.batch_size]),
+            ages=torch.tensor(random_prompt_ages[: args.batch_size]),
+            max_length=args.context_window,
             mini_num_of_concepts=args.min_num_of_concepts,
             top_p=args.top_p,
             top_k=args.top_k,

cehrgpt/generation/omop_converter_batch.py CHANGED Viewed

@@ -270,20 +270,24 @@ def gpt_to_omop_converter_batch(
         is_numeric_types = (
             is_numeric_types[START_TOKEN_SIZE:]
-            if is_numeric_types is not None
+            if is_numeric_types is not None and not np.all(pd.isna(is_numeric_types))
             else None
         )
         number_as_values = (
             number_as_values[START_TOKEN_SIZE:]
-            if number_as_values is not None
+            if number_as_values is not None and not np.all(pd.isna(number_as_values))
             else None
         )
         concept_as_values = (
             concept_as_values[START_TOKEN_SIZE:]
-            if concept_as_values is not None
+            if concept_as_values is not None and not np.all(pd.isna(concept_as_values))
+            else None
+        )
+        units = (
+            units[START_TOKEN_SIZE:]
+            if units is not None and not np.all(pd.isna(units))
             else None
         )
-        units = units[START_TOKEN_SIZE:] if units is not None else None
         # TODO:Need to decode if the input is tokenized
         [start_year, start_age, start_gender, start_race] = concept_ids[
@@ -441,6 +445,9 @@ def gpt_to_omop_converter_batch(
             ]:
                 # If it's a start token, skip it
                 pass
+            elif event.endswith("/0"):
+                # This should capture the concept such as Visit/0, Discharge/0
+                pass
             else:
                 try:
                     concept_id = int(event)

cehrgpt 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

cehrgpt 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl