cehrgpt 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cehrgpt/analysis/irregularity.py +36 -0
- cehrgpt/data/hf_cehrgpt_dataset.py +1 -0
- cehrgpt/data/hf_cehrgpt_dataset_collator.py +454 -68
- cehrgpt/data/hf_cehrgpt_dataset_mapping.py +232 -17
- cehrgpt/data/sample_packing_sampler.py +36 -6
- cehrgpt/generation/cehrgpt_conditional_generation.py +314 -0
- cehrgpt/generation/generate_batch_hf_gpt_sequence.py +15 -3
- cehrgpt/generation/omop_converter_batch.py +32 -2
- cehrgpt/gpt_utils.py +20 -2
- cehrgpt/models/config.py +25 -0
- cehrgpt/models/hf_cehrgpt.py +244 -39
- cehrgpt/models/hf_modeling_outputs.py +1 -0
- cehrgpt/models/special_tokens.py +1 -0
- cehrgpt/models/tokenization_hf_cehrgpt.py +354 -71
- cehrgpt/runners/data_utils.py +131 -5
- cehrgpt/runners/hf_cehrgpt_finetune_runner.py +84 -51
- cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +59 -7
- cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +60 -0
- cehrgpt/runners/hyperparameter_search_util.py +6 -7
- cehrgpt/runners/sample_packing_trainer.py +17 -0
- cehrgpt/time_to_event/config/1_year_cabg.yaml +23 -0
- cehrgpt/time_to_event/time_to_event_model.py +2 -13
- cehrgpt/time_to_event/time_to_event_prediction.py +27 -13
- cehrgpt/tools/linear_prob/compute_cehrgpt_features.py +80 -62
- {cehrgpt-0.1.0.dist-info → cehrgpt-0.1.2.dist-info}/METADATA +102 -7
- {cehrgpt-0.1.0.dist-info → cehrgpt-0.1.2.dist-info}/RECORD +29 -26
- {cehrgpt-0.1.0.dist-info → cehrgpt-0.1.2.dist-info}/WHEEL +1 -1
- {cehrgpt-0.1.0.dist-info → cehrgpt-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {cehrgpt-0.1.0.dist-info → cehrgpt-0.1.2.dist-info}/top_level.txt +0 -0
cehrgpt/data/hf_cehrgpt_dataset_mapping.py
@@ -1,8 +1,10 @@
 import datetime
-from
+from collections import defaultdict
+from typing import Any, Dict, Generator, List, Optional, Union
 
 import numpy as np
 import pandas as pd
+from cehrbert.data_generators.hf_data_generator import UNKNOWN_VALUE
 from cehrbert.data_generators.hf_data_generator.hf_dataset_mapping import (
     ED_VISIT_TYPE_CODES,
     INPATIENT_VISIT_TYPE_CODES,
@@ -15,9 +17,16 @@ from cehrbert.data_generators.hf_data_generator.hf_dataset_mapping import (
 )
 from cehrbert.med_extension.schema_extension import Event
 from cehrbert.runners.hf_runner_argument_dataclass import DataTrainingArguments
+from cehrbert_data.const.artificial_tokens import (
+    DISCHARGE_UNKNOWN_TOKEN,
+    GENDER_UNKNOWN_TOKEN,
+    RACE_UNKNOWN_TOKEN,
+)
 from cehrbert_data.const.common import NA
 from cehrbert_data.decorators.patient_event_decorator_base import get_att_function
+from datasets.formatting.formatting import LazyBatch
 from dateutil.relativedelta import relativedelta
+from pandas import Series
 
 from cehrgpt.models.tokenization_hf_cehrgpt import (
     NONE_BIN,
@@ -25,14 +34,60 @@ from cehrgpt.models.tokenization_hf_cehrgpt import (
     CehrGptTokenizer,
 )
 
+CEHRGPT_COLUMNS = [
+    "concept_ids",
+    "concept_value_masks",
+    "number_as_values",
+    "concept_as_values",
+    "is_numeric_types",
+    "concept_values",
+    "units",
+    "epoch_times",
+]
+
+
+def convert_date_to_posix_time(index_date: Union[datetime.date, int, float]) -> float:
+    if isinstance(index_date, datetime.date):
+        return (
+            datetime.datetime.combine(index_date, datetime.datetime.min.time())
+            .replace(tzinfo=datetime.timezone.utc)
+            .timestamp()
+        )
+    elif isinstance(index_date, datetime.datetime):
+        return index_date.replace(tzinfo=datetime.timezone.utc).timestamp()
+    return index_date
+
+
+class DatasetMappingDecorator(DatasetMapping):
+
+    def batch_transform(
+        self, records: Union[LazyBatch, Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
+        """
+        Drop index_date if it contains None.
+
+        :param records:
+        :return:
+        """
+        if isinstance(records, LazyBatch):
+            table = records.pa_table
+
+            if "index_date" in table.column_names:
+                index_col = table.column("index_date")
+                if index_col.null_count > 0:
+                    table = table.drop(["index_date"])
+                    records = LazyBatch(pa_table=table, formatter=records.formatter)
+        else:
+            if "index_date" in records:
+                if pd.isna(records["index_date"][0]):
+                    del records["index_date"]
+        return super().batch_transform(records=records)
 
-def convert_date_to_posix_time(index_date: datetime.date) -> float:
-    return datetime.datetime.combine(
-        index_date, datetime.datetime.min.time()
-    ).timestamp()
+    def transform(self, record: Dict[str, Any]) -> Union[Dict[str, Any], Series]:
+        raise NotImplemented("Must be implemented")
 
 
-class MedToCehrGPTDatasetMapping(DatasetMapping):
+class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
     def __init__(
         self,
         data_args: DataTrainingArguments,
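
Behavior of the new `convert_date_to_posix_time` helper, sketched as assertions (the import path is taken from the file list above; this reflects my reading of the branch order, not a documented guarantee). Because `datetime.datetime` subclasses `datetime.date`, a datetime input also takes the first branch, where `datetime.datetime.combine()` ignores its time component, so the `elif` branch is effectively unreachable:

```python
import datetime
from cehrgpt.data.hf_cehrgpt_dataset_mapping import convert_date_to_posix_time

# A plain date is anchored to midnight UTC:
assert convert_date_to_posix_time(datetime.date(1970, 1, 2)) == 86400.0
# Numeric timestamps pass through unchanged:
assert convert_date_to_posix_time(1234.5) == 1234.5
# A datetime is matched by isinstance(..., datetime.date) first and is
# therefore also anchored to midnight of its day:
assert convert_date_to_posix_time(datetime.datetime(1970, 1, 1, 1, 0, 0)) == 0.0
```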
@@ -65,6 +120,7 @@ class MedToCehrGPTDatasetMapping(DatasetMapping):
     def _update_cehrgpt_record(
         cehrgpt_record: Dict[str, Any],
         code: str,
+        time: datetime.datetime,
         concept_value_mask: int = 0,
         number_as_value: float = 0.0,
         concept_as_value: str = "0",
@@ -77,6 +133,9 @@ class MedToCehrGPTDatasetMapping(DatasetMapping):
         cehrgpt_record["concept_as_values"].append(concept_as_value)
         cehrgpt_record["units"].append(unit)
         cehrgpt_record["is_numeric_types"].append(is_numeric_type)
+        cehrgpt_record["epoch_times"].append(
+            time.replace(tzinfo=datetime.timezone.utc).timestamp()
+        )
 
     def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
         cehrgpt_record = {
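
The new `time` parameter threads a timestamp through every `_update_cehrgpt_record` call so that the new `epoch_times` column stays index-aligned with `concept_ids`. A simplified sketch (names mirror the diff, logic reduced; not package code):

```python
import datetime
from collections import defaultdict

record = defaultdict(list)

def append_token(record, code, time):
    # Every token carries a UTC POSIX timestamp in a parallel list.
    record["concept_ids"].append(code)
    record["epoch_times"].append(
        time.replace(tzinfo=datetime.timezone.utc).timestamp()
    )

visit_start = datetime.datetime(2020, 3, 1, 9, 30)
append_token(record, "[VS]", visit_start)
append_token(record, "9202", visit_start)  # visit type token, same timestamp

assert len(record["concept_ids"]) == len(record["epoch_times"])
```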
@@ -87,13 +146,16 @@ class MedToCehrGPTDatasetMapping(DatasetMapping):
             "concept_as_values": [],
             "units": [],
             "is_numeric_types": [],
+            "epoch_times": [],
         }
         # Extract the demographic information
         birth_datetime = record["birth_datetime"]
         if isinstance(birth_datetime, pd.Timestamp):
             birth_datetime = birth_datetime.to_pydatetime()
         gender = record["gender"]
+        gender = GENDER_UNKNOWN_TOKEN if gender == UNKNOWN_VALUE else gender
         race = record["race"]
+        race = RACE_UNKNOWN_TOKEN if race == UNKNOWN_VALUE else race
         visits = record["visits"]
         # This indicates this is columnar format
         if isinstance(visits, dict):
@@ -108,10 +170,12 @@ class MedToCehrGPTDatasetMapping(DatasetMapping):
         )
         year_str = f"year:{str(first_visit_start_datetime.year)}"
         age_str = f"age:{str(relativedelta(first_visit_start_datetime, birth_datetime).years)}"
-        self._update_cehrgpt_record(
-
-
-        self._update_cehrgpt_record(cehrgpt_record,
+        self._update_cehrgpt_record(
+            cehrgpt_record, year_str, first_visit_start_datetime
+        )
+        self._update_cehrgpt_record(cehrgpt_record, age_str, first_visit_start_datetime)
+        self._update_cehrgpt_record(cehrgpt_record, gender, first_visit_start_datetime)
+        self._update_cehrgpt_record(cehrgpt_record, race, first_visit_start_datetime)
 
         # Use a data cursor to keep track of time
         datetime_cursor: Optional[datetime.datetime] = None
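
Sketch of the demographic prologue after this hunk: unknown gender and race values are remapped to dedicated unknown tokens, and all four leading tokens share the first visit's start time. The token string values below are assumptions for illustration, not the actual constants from `cehrbert_data`:

```python
import datetime

UNKNOWN_VALUE = "Unknown"           # assumed sentinel from cehrbert
GENDER_UNKNOWN_TOKEN = "Gender/0"   # assumed token constants from cehrbert_data
RACE_UNKNOWN_TOKEN = "Race/0"

gender, race = UNKNOWN_VALUE, "Race/8527"
gender = GENDER_UNKNOWN_TOKEN if gender == UNKNOWN_VALUE else gender
race = RACE_UNKNOWN_TOKEN if race == UNKNOWN_VALUE else race

first_visit_start = datetime.datetime(2015, 6, 1)
prologue = [
    (f"year:{first_visit_start.year}", first_visit_start),
    ("age:40", first_visit_start),  # age computed via relativedelta in the real code
    (gender, first_visit_start),
    (race, first_visit_start),
]
print(prologue)  # four (token, timestamp) pairs opening the sequence
```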
@@ -146,6 +210,7 @@ class MedToCehrGPTDatasetMapping(DatasetMapping):
                 self._update_cehrgpt_record(
                     cehrgpt_record,
                     code=self._time_token_function(time_delta),
+                    time=visit_start_datetime,
                 )
 
             datetime_cursor = visit_start_datetime
@@ -153,11 +218,13 @@ class MedToCehrGPTDatasetMapping(DatasetMapping):
             self._update_cehrgpt_record(
                 cehrgpt_record,
                 code="[VS]",
+                time=datetime_cursor,
             )
             # Add a visit type token
             self._update_cehrgpt_record(
                 cehrgpt_record,
                 code=visit_type,
+                time=datetime_cursor,
             )
             # We need to insert an inpatient hour token right after the visit type, we calculate the hour interval
             # with respect to the midnight of the day
@@ -167,6 +234,7 @@ class MedToCehrGPTDatasetMapping(DatasetMapping):
                 self._update_cehrgpt_record(
                     cehrgpt_record,
                     code=f"i-H{datetime_cursor.hour}",
+                    time=datetime_cursor,
                 )
 
             # Keep track of the existing outpatient events, we don't want to add them again
@@ -185,6 +253,10 @@ class MedToCehrGPTDatasetMapping(DatasetMapping):
                 concept_value_mask = int(
                     numeric_value is not None or text_value is not None
                 )
+                if numeric_value is None and text_value is not None:
+                    if text_value.isnumeric():
+                        numeric_value = float(text_value)
+
                 is_numeric_type = int(numeric_value is not None)
                 code = replace_escape_chars(e["code"])
 
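
The coercion above relies on `str.isnumeric()`, which matches only strings composed entirely of numeric characters. A quick behavior check (not package code):

```python
for text_value in ["120", "3.5", "-2", "98.6 F"]:
    numeric_value = float(text_value) if text_value.isnumeric() else None
    print(text_value, "->", numeric_value)
# 120 -> 120.0; the other three stay None and remain text values
```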
@@ -208,6 +280,7 @@ class MedToCehrGPTDatasetMapping(DatasetMapping):
                         self._update_cehrgpt_record(
                             cehrgpt_record,
                             code=f"i-{self._inpatient_time_token_function(time_diff_days)}",
+                            time=event_time,
                         )
 
                         if self._include_inpatient_hour_token:
@@ -226,6 +299,7 @@ class MedToCehrGPTDatasetMapping(DatasetMapping):
                             self._update_cehrgpt_record(
                                 cehrgpt_record,
                                 code=f"i-H{time_diff_hours}",
+                                time=event_time,
                             )
 
                 if event_identity in existing_duplicate_events:
@@ -234,6 +308,7 @@ class MedToCehrGPTDatasetMapping(DatasetMapping):
                 self._update_cehrgpt_record(
                     cehrgpt_record,
                     code=code,
+                    time=event_time,
                     concept_value_mask=concept_value_mask,
                     unit=unit,
                     number_as_value=numeric_value if numeric_value else 0.0,
@@ -262,17 +337,24 @@ class MedToCehrGPTDatasetMapping(DatasetMapping):
             # facility event
             discharge_facility = get_value(visit, "discharge_facility")
             if not discharge_facility:
-                discharge_facility =
-
+                discharge_facility = DISCHARGE_UNKNOWN_TOKEN
+            else:
+                discharge_facility = (
+                    DISCHARGE_UNKNOWN_TOKEN
+                    if discharge_facility == UNKNOWN_VALUE
+                    else discharge_facility
+                )
             self._update_cehrgpt_record(
                 cehrgpt_record,
                 code=discharge_facility,
+                time=datetime_cursor,
             )
 
             # Reuse the age and date calculated for the last event in the patient timeline
             self._update_cehrgpt_record(
                 cehrgpt_record,
                 code="[VE]",
+                time=datetime_cursor,
             )
 
         # Generate the orders of the concepts that the cehrbert dataset mapping function expects
@@ -284,17 +366,23 @@ class MedToCehrGPTDatasetMapping(DatasetMapping):
         cehrgpt_record["num_of_concepts"] = len(cehrgpt_record["concept_ids"])
         cehrgpt_record["num_of_visits"] = len(visits)
 
-        if record.get("index_date", None):
-            cehrgpt_record["index_date"] =
-
+        if record.get("index_date", None) is not None:
+            cehrgpt_record["index_date"] = (
+                record["index_date"].replace(tzinfo=datetime.timezone.utc).timestamp()
+            )
+        if record.get("label", None) is not None:
             cehrgpt_record["label"] = record["label"]
-        if record.get("age_at_index", None):
+        if record.get("age_at_index", None) is not None:
             cehrgpt_record["age_at_index"] = record["age_at_index"]
 
+        assert len(cehrgpt_record["epoch_times"]) == len(
+            cehrgpt_record["concept_ids"]
+        ), "The number of time stamps must match with the number of concepts in the sequence"
+
         return cehrgpt_record
 
 
-class HFCehrGptTokenizationMapping(DatasetMapping):
+class HFCehrGptTokenizationMapping(DatasetMappingDecorator):
     def __init__(
         self,
         concept_tokenizer: CehrGptTokenizer,
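
The guards above switch from truthiness to explicit `is not None` checks. A small illustration (not package code) of why that matters: a label of 0, the negative class, or an `age_at_index` of 0 is falsy and would be silently dropped by the old style:

```python
record = {"label": 0, "age_at_index": 0}

kept_old = {k: v for k, v in record.items() if record.get(k)}              # {}
kept_new = {k: v for k, v in record.items() if record.get(k) is not None}  # both kept
assert kept_old == {} and kept_new == {"label": 0, "age_at_index": 0}
```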
@@ -308,9 +396,46 @@ class HFCehrGptTokenizationMapping(DatasetMapping):
         "is_numeric_types",
     ]
 
+    def filter_out_invalid_tokens(self, record: Dict[str, Any]) -> Dict[str, Any]:
+        column_names = []
+        seq_length = len(record["concept_ids"])
+
+        # We can't have "0" as a token in the tokenizer because it would break tokenization for "Race/0", "Visit/0"
+        # This is a pre-caution
+        if "0" in record["concept_ids"]:
+            if isinstance(record["concept_ids"], np.ndarray):
+                record["concept_ids"][record["concept_ids"] == "0"] = "Unknown"
+            else:
+                record["concept_ids"] = [
+                    "Unknown" if x == "0" else x for x in record["concept_ids"]
+                ]
+
+        for k, v in record.items():
+            if k not in CEHRGPT_COLUMNS:
+                continue
+            if isinstance(v, (list, np.ndarray)) and len(v) == seq_length:
+                column_names.append(k)
+        valid_concept_ids = self._concept_tokenizer.get_vocab().keys()
+        valid_indices = [
+            idx
+            for idx, concept_id in enumerate(record["concept_ids"])
+            if concept_id in valid_concept_ids
+        ]
+        if len(valid_indices) != len(record["concept_ids"]):
+            for column in column_names:
+                values = record[column]
+                record[column] = [values[idx] for idx in valid_indices]
+        return record
+
     def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
+        # Remove the tokens from patient sequences that do not exist in the tokenizer
+        record = self.filter_out_invalid_tokens(record)
         # If any concept has a value associated with it, we normalize the value
         record["input_ids"] = self._concept_tokenizer.encode(record["concept_ids"])
+        assert len(record["input_ids"]) == len(record["concept_ids"]), (
+            "The number of tokens must equal to the number of concepts\n"
+            f"decoded concept_ids: {self._concept_tokenizer.decode(record['input_ids'], skip_special_tokens=False)}"
+        )
         record["value_indicators"] = record["concept_value_masks"]
         if "number_as_values" not in record or "concept_as_values" not in record:
             record["number_as_values"] = [
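
Standalone sketch of the new out-of-vocabulary filtering: sequence-aligned columns are sliced down to the indices whose `concept_ids` exist in the tokenizer vocabulary, while scalar fields are left alone. The toy vocab below is illustrative; the real method consults `self._concept_tokenizer.get_vocab()`:

```python
import numpy as np

vocab = {"[VS]": 0, "[VE]": 1, "year:2015": 2, "age:40": 3}
record = {
    "concept_ids": ["year:2015", "age:40", "NotInVocab", "[VS]", "[VE]"],
    "epoch_times": [1.0, 1.0, 2.0, 2.0, 3.0],
    "person_id": 42,  # not sequence-aligned, left untouched
}
seq_length = len(record["concept_ids"])
valid = [i for i, c in enumerate(record["concept_ids"]) if c in vocab]
for k, v in record.items():
    if isinstance(v, (list, np.ndarray)) and len(v) == seq_length:
        record[k] = [v[i] for i in valid]
print(record["concept_ids"])  # ['year:2015', 'age:40', '[VS]', '[VE]']
print(record["epoch_times"])  # [1.0, 1.0, 2.0, 3.0]
```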
@@ -391,3 +516,93 @@ class HFFineTuningMapping(HFCehrGptTokenizationMapping):
         columns = super().remove_columns()
         columns.append("label")
         return columns
+
+
+class ExtractTokenizedSequenceDataMapping:
+    def __init__(
+        self,
+        person_index_date_map: Dict[int, List[Dict[str, Any]]],
+        observation_window: int = 0,
+    ):
+        self.person_index_date_map = person_index_date_map
+        self.observation_window = observation_window
+
+    def _calculate_prediction_start_time(self, prediction_time: float):
+        if self.observation_window and self.observation_window > 0:
+            return max(prediction_time - self.observation_window * 24 * 3600, 0)
+        return 0
+
+    def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
+        person_id = record["person_id"]
+        prediction_times = self.person_index_date_map[person_id]
+        prediction_start_end_times = [
+            (
+                self._calculate_prediction_start_time(
+                    prediction_time_label_map["index_date"]
+                    .replace(tzinfo=datetime.timezone.utc)
+                    .timestamp()
+                ),
+                prediction_time_label_map["index_date"]
+                .replace(tzinfo=datetime.timezone.utc)
+                .timestamp(),
+                prediction_time_label_map["label"],
+            )
+            for prediction_time_label_map in prediction_times
+        ]
+        observation_window_indices = np.zeros(
+            (len(prediction_times), len(record["epoch_times"])), dtype=bool
+        )
+        for i, epoch_time in enumerate(record["epoch_times"]):
+            for sample_n, (
+                feature_extraction_time_start,
+                feature_extraction_end_end,
+                _,
+            ) in enumerate(prediction_start_end_times):
+                if (
+                    feature_extraction_time_start
+                    <= epoch_time
+                    <= feature_extraction_end_end
+                ):
+                    observation_window_indices[sample_n][i] = True
+
+        seq_length = len(record["epoch_times"])
+        time_series_columns = ["concept_ids", "input_ids"]
+        static_inputs = dict()
+        for k, v in record.items():
+            if k in ["concept_ids", "input_ids"]:
+                continue
+            if isinstance(v, (list, np.ndarray)) and len(v) == seq_length:
+                time_series_columns.append(k)
+            else:
+                static_inputs[k] = v
+
+        batched_samples = defaultdict(list)
+        for (_, index_date, label), observation_window_index in zip(
+            prediction_start_end_times, observation_window_indices
+        ):
+            for k, v in static_inputs.items():
+                batched_samples[k].append(v)
+            batched_samples["classifier_label"].append(label)
+            batched_samples["index_date"].append(index_date)
+            try:
+                start_age = int(record["concept_ids"][1].split(":")[1])
+            except Exception:
+                start_age = -1
+            batched_samples["age_at_index"].append(start_age)
+            for time_series_column in time_series_columns:
+                batched_samples[time_series_column].append(
+                    np.asarray(record[time_series_column])[observation_window_index]
+                )
+        return batched_samples
+
+    def batch_transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
+        all_batched_record = defaultdict(list)
+        all_columns = record.keys()
+        for i in range(len(record["concept_ids"])):
+            one_record = {}
+            for column in all_columns:
+                one_record[column] = record[column][i]
+            new_batched_record = self.transform(one_record)
+            for k, v in new_batched_record.items():
+                all_batched_record[k].extend(v)
+        return all_batched_record
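
Hypothetical usage of the new `ExtractTokenizedSequenceDataMapping`: for each prediction time registered for a person, tokens whose `epoch_times` fall within [index_date − observation_window days, index_date] are sliced into one classification sample. The values below are invented for illustration:

```python
import datetime

utc = datetime.timezone.utc
person_index_date_map = {
    42: [
        {"index_date": datetime.datetime(2020, 1, 10), "label": 1},
        {"index_date": datetime.datetime(2020, 6, 1), "label": 0},
    ]
}
record = {
    "person_id": 42,
    "concept_ids": ["year:2020", "age:40", "[VS]", "320128", "[VE]"],
    "input_ids": [2, 3, 4, 5, 6],
    # all five tokens stamped Jan 5, 2020
    "epoch_times": [datetime.datetime(2020, 1, 5, tzinfo=utc).timestamp()] * 5,
}
# mapping = ExtractTokenizedSequenceDataMapping(person_index_date_map, observation_window=30)
# batched = mapping.transform(record)
# -> two samples: the Jan 10 window keeps all five tokens, the Jun 1 window keeps none.
```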
cehrgpt/data/sample_packing_sampler.py
@@ -1,5 +1,6 @@
 from typing import Iterator, List, Optional
 
+import numpy as np
 import torch
 import torch.distributed as dist
 from torch.utils.data import Sampler
@@ -33,6 +34,8 @@ class SamplePackingBatchSampler(Sampler[List[int]]):
         rank: Optional[int] = None,
         seed: int = 0,
         drop_last: bool = False,
+        negative_sampling_probability: Optional[float] = None,
+        labels: Optional[List[int]] = None,
     ):
         """
         Args:
@@ -73,6 +76,11 @@ class SamplePackingBatchSampler(Sampler[List[int]]):
                 f"Invalid rank {rank}, rank should be in the interval [0, {num_replicas - 1}]"
             )
 
+        if negative_sampling_probability is not None and labels is None:
+            raise ValueError(
+                f"When the negative sampling probability is provide, the labels must be provided as well"
+            )
+
         self.lengths = lengths
         self.max_tokens_per_batch = max_tokens_per_batch
         self.max_position_embeddings = max_position_embeddings
@@ -80,6 +88,8 @@ class SamplePackingBatchSampler(Sampler[List[int]]):
         self.rank = rank
         self.seed = seed
         self.drop_last = drop_last
+        self.negative_sampling_probability = negative_sampling_probability
+        self.labels = labels
         # Trainer https://github.com/huggingface/transformers/blame/main/src/transformers/trainer.py#L2470
         # http://github.com/huggingface/accelerate/blob/v0.31.0/src/accelerate/data_loader.py#L482
         # the huggingface trainer will call the accelerate.data_loader.DataLoaderShard.set_epoch,
@@ -100,6 +110,14 @@ class SamplePackingBatchSampler(Sampler[List[int]]):
         current_batch_tokens = 0
 
         for idx in indices:
+            # There is a chance to skip the negative samples to account for the class imbalance
+            # in the fine-tuning dataset
+            if self.negative_sampling_probability:
+                if (
+                    np.random.random() > self.negative_sampling_probability
+                    and self.labels[idx] == 0
+                ):
+                    continue
             # We take the minimum of the two because each sequence will be truncated to fit
             # the context window of the model
             sample_length = min(self.lengths[idx], self.max_position_embeddings)
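
Sketch of the sampling rule added in this hunk (not package code): a negative sample (label == 0) survives with probability `negative_sampling_probability`, positives always survive, so a suitable probability roughly rebalances the classes:

```python
import numpy as np

rng = np.random.default_rng(0)
labels = np.array([0] * 9000 + [1] * 1000)
negative_sampling_probability = 1 / 9  # keep ~1 negative per positive

kept = [
    label
    for label in labels
    if label == 1 or rng.random() <= negative_sampling_probability
]
print(sum(kept), len(kept) - sum(kept))  # ~1000 positives, ~1000 negatives
```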
@@ -131,10 +149,22 @@ class SamplePackingBatchSampler(Sampler[List[int]]):
         if len(self.lengths) == 0:
             return 0
 
-        # We need to truncate the lengths due to the context window limit imposed by the model
-        truncated_lengths = [
-            min(self.max_position_embeddings, length + 2) for length in self.lengths
-        ]
+        # There is a chance to skip the negative samples to account for the class imbalance
+        # in the fine-tuning dataset
+        if self.negative_sampling_probability:
+            truncated_lengths = []
+            for length, label in zip(self.lengths, self.labels):
+                if (
+                    np.random.random() > self.negative_sampling_probability
+                    and label == 0
+                ):
+                    continue
+                truncated_lengths.append(length)
+        else:
+            # We need to truncate the lengths due to the context window limit imposed by the model
+            truncated_lengths = [
+                min(self.max_position_embeddings, length + 2) for length in self.lengths
+            ]
 
         # Calculate average sequence length
         avg_seq_length = sum(truncated_lengths) // len(truncated_lengths)
@@ -145,7 +175,7 @@ class SamplePackingBatchSampler(Sampler[List[int]]):
         # Estimate total number of batches
         if self.drop_last:
             # If dropping last incomplete batch
-            return len(truncated_lengths) // seqs_per_batch
+            return len(truncated_lengths) // seqs_per_batch
         else:
             # If keeping last incomplete batch, ensure at least 1 batch
-            return max(1, len(truncated_lengths) // seqs_per_batch)
+            return max(1, len(truncated_lengths) // seqs_per_batch)
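
A hypothetical construction of the sampler with the two new arguments; the other parameters follow the signature shown in the hunks above, and any omitted ones are assumed to keep their defaults:

```python
from cehrgpt.data.sample_packing_sampler import SamplePackingBatchSampler

lengths = [512, 2048, 128, 4096]
labels = [0, 1, 0, 1]

batch_sampler = SamplePackingBatchSampler(
    lengths=lengths,
    max_tokens_per_batch=8192,
    max_position_embeddings=2048,
    seed=42,
    drop_last=False,
    negative_sampling_probability=0.5,  # each negative kept ~50% of the time
    labels=labels,                      # required when downsampling negatives
)
# Note: __len__ and __iter__ each re-run the random filter, so epoch sizes vary.
```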