cehrgpt-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __init__.py +0 -0
- cehrgpt/__init__.py +0 -0
- cehrgpt/analysis/__init__.py +0 -0
- cehrgpt/analysis/privacy/__init__.py +0 -0
- cehrgpt/analysis/privacy/attribute_inference.py +275 -0
- cehrgpt/analysis/privacy/attribute_inference_config.yml +8975 -0
- cehrgpt/analysis/privacy/member_inference.py +172 -0
- cehrgpt/analysis/privacy/nearest_neighbor_inference.py +189 -0
- cehrgpt/analysis/privacy/reid_inference.py +407 -0
- cehrgpt/analysis/privacy/utils.py +255 -0
- cehrgpt/cehrgpt_args.py +142 -0
- cehrgpt/data/__init__.py +0 -0
- cehrgpt/data/hf_cehrgpt_dataset.py +80 -0
- cehrgpt/data/hf_cehrgpt_dataset_collator.py +482 -0
- cehrgpt/data/hf_cehrgpt_dataset_mapping.py +116 -0
- cehrgpt/generation/__init__.py +0 -0
- cehrgpt/generation/chatgpt_generation.py +106 -0
- cehrgpt/generation/generate_batch_hf_gpt_sequence.py +333 -0
- cehrgpt/generation/omop_converter_batch.py +644 -0
- cehrgpt/generation/omop_entity.py +515 -0
- cehrgpt/gpt_utils.py +331 -0
- cehrgpt/models/__init__.py +0 -0
- cehrgpt/models/config.py +205 -0
- cehrgpt/models/hf_cehrgpt.py +1817 -0
- cehrgpt/models/hf_modeling_outputs.py +158 -0
- cehrgpt/models/pretrained_embeddings.py +82 -0
- cehrgpt/models/special_tokens.py +30 -0
- cehrgpt/models/tokenization_hf_cehrgpt.py +1077 -0
- cehrgpt/omop/__init__.py +0 -0
- cehrgpt/omop/condition_era.py +20 -0
- cehrgpt/omop/observation_period.py +43 -0
- cehrgpt/omop/omop_argparse.py +38 -0
- cehrgpt/omop/omop_table_builder.py +86 -0
- cehrgpt/omop/queries/__init__.py +0 -0
- cehrgpt/omop/queries/condition_era.py +86 -0
- cehrgpt/omop/queries/observation_period.py +135 -0
- cehrgpt/omop/sample_omop_tables.py +71 -0
- cehrgpt/runners/__init__.py +0 -0
- cehrgpt/runners/gpt_runner_util.py +99 -0
- cehrgpt/runners/hf_cehrgpt_finetune_runner.py +746 -0
- cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +370 -0
- cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +137 -0
- cehrgpt/runners/hyperparameter_search_util.py +223 -0
- cehrgpt/time_to_event/__init__.py +0 -0
- cehrgpt/time_to_event/config/30_day_readmission.yaml +8 -0
- cehrgpt/time_to_event/config/next_visit_type_prediction.yaml +8 -0
- cehrgpt/time_to_event/config/t2dm_hf.yaml +8 -0
- cehrgpt/time_to_event/time_to_event_model.py +226 -0
- cehrgpt/time_to_event/time_to_event_prediction.py +347 -0
- cehrgpt/time_to_event/time_to_event_utils.py +55 -0
- cehrgpt/tools/__init__.py +0 -0
- cehrgpt/tools/ehrshot_benchmark.py +74 -0
- cehrgpt/tools/generate_pretrained_embeddings.py +130 -0
- cehrgpt/tools/merge_synthetic_real_dataasets.py +218 -0
- cehrgpt/tools/upload_omop_tables.py +108 -0
- cehrgpt-0.0.1.dist-info/LICENSE +21 -0
- cehrgpt-0.0.1.dist-info/METADATA +66 -0
- cehrgpt-0.0.1.dist-info/RECORD +60 -0
- cehrgpt-0.0.1.dist-info/WHEEL +5 -0
- cehrgpt-0.0.1.dist-info/top_level.txt +2 -0
cehrgpt/gpt_utils.py
ADDED
@@ -0,0 +1,331 @@
import random
import re
from datetime import date, timedelta
from typing import List, Sequence, Tuple

from cehrgpt.cehrgpt_args import SamplingStrategy
from cehrgpt.models.special_tokens import (
    DISCHARGE_CONCEPT_IDS,
    END_TOKEN,
    VISIT_CONCEPT_IDS,
)

# Regular expression pattern to match inpatient ATT (artificial time) tokens
INPATIENT_ATT_PATTERN = re.compile(r"(?:VS-|i-)D(\d+)(?:-VE)?")
DEMOGRAPHIC_PROMPT_SIZE = 4

class RandomSampleCache:
    def __init__(
        self,
        data_indices: Sequence[int],
        cache_size: int,
        sample_weights: Sequence[float] = None,
    ):
        """
        Initialize the RandomSampleCache.

        :param data_indices: Sequence of data indices to sample from.
        :param cache_size: Size of the cache.
        :param sample_weights: Optional sequence of weights for sampling.
        """
        self._data_indices = data_indices
        self._sample_weights = sample_weights
        self._cache_size = cache_size
        self._cache = []

        if self._sample_weights is not None:
            # The weights must sum to one (within floating-point tolerance).
            assert abs(sum(self._sample_weights) - 1) < 1e-8

    def next(self):
        """
        Get the next sample from the cache.

        If the cache is empty, refill it.

        :return: A sampled data index.
        """
        if not self._cache:
            if self._sample_weights is not None:
                self._cache.extend(
                    random.choices(
                        self._data_indices,
                        k=self._cache_size,
                        weights=self._sample_weights,
                    )
                )
            else:
                self._cache.extend(
                    random.choices(self._data_indices, k=self._cache_size)
                )
        return self._cache.pop()

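A minimal usage sketch (the indices and weights below are illustrative, not from the package): the cache draws `cache_size` weighted samples in one call to `random.choices` and hands them out one at a time, refilling only when exhausted.

    cache = RandomSampleCache(
        data_indices=[0, 1, 2], cache_size=100, sample_weights=[0.2, 0.3, 0.5]
    )
    samples = [cache.next() for _ in range(250)]  # triggers three refills of 100 draws

Batching the draws this way amortizes the cost of weighted sampling over many `next()` calls.
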
def collect_demographic_prompts_at_visits(patient_history: List[str]):
    demographic_prompts_at_visits = []
    start_year, start_age, start_gender, start_race = patient_history[
        :DEMOGRAPHIC_PROMPT_SIZE
    ]
    try:
        start_year = int(start_year.split(":")[1])
        start_age = int(start_age.split(":")[1])
        valid_prompt = True
    except (IndexError, ValueError):
        start_year = 1900
        start_age = 0
        valid_prompt = False
    data_cursor = date(int(start_year), 1, 1)
    birth_date = date(start_year - start_age, 1, 1)
    for i, current_token in enumerate(patient_history):
        if is_visit_start(current_token):
            reconstructed_year = (
                f"year:{data_cursor.year}" if valid_prompt else "year:unknown"
            )
            reconstructed_age = (
                f"age:{data_cursor.year - birth_date.year}"
                if valid_prompt
                else "age:unknown"
            )
            demographic_prompts_at_visits.append(
                (
                    i,
                    (
                        reconstructed_year,
                        reconstructed_age,
                        start_gender,
                        start_race,
                    ),
                )
            )
        elif is_att_token(current_token):
            att_date_delta = extract_time_interval_in_days(current_token)
            data_cursor = data_cursor + timedelta(days=att_date_delta)
    return demographic_prompts_at_visits

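As a hedged illustration of what this returns (the concept IDs and demographic token spellings below are made up for the example), a four-token demographic prompt followed by two visits separated by a one-month ATT token yields one reconstructed prompt per visit start:

    history = [
        "year:2010", "age:40", "Gender/M", "Race/White",  # demographic prompt
        "VS", "320128", "VE",                             # first visit
        "M1",                                             # one-month ATT token
        "VS", "9201", "VE",                               # second visit
    ]
    collect_demographic_prompts_at_visits(history)
    # [(4, ('year:2010', 'age:40', 'Gender/M', 'Race/White')),
    #  (8, ('year:2010', 'age:40', 'Gender/M', 'Race/White'))]
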
def random_slice_gpt_sequence(concept_ids, max_seq_len):
    """
    Randomly slice a GPT sequence.

    :param concept_ids: List of concept IDs.
    :param max_seq_len: Maximum sequence length.
    :return: Tuple containing start index, end index, and demographic tokens.
    """
    seq_length = len(concept_ids)
    starting_points = []
    start_year, start_age, start_gender, start_race = concept_ids[
        :DEMOGRAPHIC_PROMPT_SIZE
    ]
    try:
        start_year = int(start_year.split(":")[1])
        start_age = int(start_age.split(":")[1])
        data_cursor = date(int(start_year), 1, 1)
        birth_date = date(start_year - start_age, 1, 1)
        for i in range(
            DEMOGRAPHIC_PROMPT_SIZE,
            min(seq_length, seq_length - max_seq_len + DEMOGRAPHIC_PROMPT_SIZE),
        ):
            current_token = concept_ids[i]
            if is_visit_start(current_token):
                starting_points.append(
                    (i, data_cursor.year, data_cursor.year - birth_date.year)
                )
            elif is_att_token(current_token):
                att_date_delta = extract_time_interval_in_days(current_token)
                data_cursor = data_cursor + timedelta(days=att_date_delta)

        if len(starting_points) == 0:
            return 0, 0, concept_ids[:DEMOGRAPHIC_PROMPT_SIZE]

        random_starting_index, random_starting_year, random_starting_age = (
            random.choice(starting_points)
        )
        demographic_tokens = [
            f"year:{random_starting_year}",
            f"age:{random_starting_age}",
            start_gender,
            start_race,
        ]
        # Walk backwards so the slice ends on a visit-end token, reserving
        # room for the demographic tokens within max_seq_len.
        random_end_index = random_starting_index
        for i in reversed(
            range(
                random_starting_index,
                random_starting_index + max_seq_len - DEMOGRAPHIC_PROMPT_SIZE,
            )
        ):
            current_token = concept_ids[i]
            if current_token == "VE":
                random_end_index = i
                break
        return random_starting_index, random_end_index, demographic_tokens

    except Exception:
        return 0, max_seq_len - 1, []

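A sketch of the slicer on a toy sequence (again with made-up concept IDs): visit starts that leave at least `max_seq_len` tokens to the right become candidate starting points, one is chosen at random, and the returned demographic tokens carry the year and age recomputed for that point in the timeline.

    concept_ids = [
        "year:2010", "age:40", "Gender/M", "Race/White",
        "VS", "320128", "VE", "D30",
        "VS", "9201", "VE", "D60",
        "VS", "9202", "VE",
    ]
    start, end, demographics = random_slice_gpt_sequence(concept_ids, max_seq_len=10)
    # start points at a "VS" token, end at a "VE" token, and demographics
    # looks like ['year:2010', 'age:40', 'Gender/M', 'Race/White']
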
def get_cehrgpt_output_folder(args, cehrgpt_tokenizer) -> str:
    if args.sampling_strategy == SamplingStrategy.TopKStrategy.value:
        folder_name = f"top_k{args.top_k}"
        args.top_p = 1.0
    elif args.sampling_strategy == SamplingStrategy.TopPStrategy.value:
        folder_name = f"top_p{int(args.top_p * 10000)}"
        args.top_k = cehrgpt_tokenizer.vocab_size
    elif args.sampling_strategy == SamplingStrategy.TopMixStrategy.value:
        folder_name = f"top_mix_p{int(args.top_p * 10000)}_k{args.top_k}"
    else:
        raise RuntimeError(
            "sampling_strategy has to be one of the following three options [TopKStrategy, TopPStrategy, TopMixStrategy]"
        )
    if args.temperature != 1.0:
        folder_name = f"{folder_name}_temp_{int(args.temperature * 10000)}"
    if args.repetition_penalty != 1.0:
        folder_name = (
            f"{folder_name}_repetition_penalty_{int(args.repetition_penalty * 10000)}"
        )
    if args.num_beams > 1:
        folder_name = f"{folder_name}_num_beams_{int(args.num_beams)}"
    if args.num_beam_groups > 1:
        folder_name = f"{folder_name}_num_beam_groups_{int(args.num_beam_groups)}"
    if args.epsilon_cutoff > 0.0:
        folder_name = (
            f"{folder_name}_epsilon_cutoff_{int(args.epsilon_cutoff * 100000)}"
        )
    return folder_name

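For illustration, a nucleus-sampling run with `top_p=0.95` and `temperature=1.2` produces a folder name like the one below; the `SimpleNamespace` objects are stand-ins for the real argparse namespace and tokenizer, and every field value here is hypothetical:

    from types import SimpleNamespace

    args = SimpleNamespace(
        sampling_strategy=SamplingStrategy.TopPStrategy.value,
        top_p=0.95, top_k=0, temperature=1.2, repetition_penalty=1.0,
        num_beams=1, num_beam_groups=1, epsilon_cutoff=0.0,
    )
    tokenizer = SimpleNamespace(vocab_size=30000)  # stand-in for the real tokenizer
    get_cehrgpt_output_folder(args, tokenizer)  # -> "top_p9500_temp_12000"
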
def is_clinical_event(token: str) -> bool:
    return token.isnumeric()


def is_visit_start(token: str) -> bool:
    """
    Check if the token indicates the start of a visit.

    :param token: Token to check.
    :return: True if the token is a visit start token, False otherwise.
    """
    return token in ["VS", "[VS]"]


def is_visit_end(token: str) -> bool:
    return token in ["VE", "[VE]"]


def is_att_token(token: str) -> bool:
    """
    Check if the token is an ATT (artificial time) token.

    :param token: Token to check.
    :return: True if the token is an ATT token, False otherwise.
    """
    if re.match(r"^D\d+", token):  # day tokens
        return True
    elif re.match(r"^W\d+", token):  # week tokens
        return True
    elif re.match(r"^M\d+", token):  # month tokens
        return True
    elif re.match(r"^Y\d+", token):  # year tokens
        return True
    elif token == "LT":
        return True
    elif token[:3] == "VS-":  # VS-D7-VE
        return True
    elif token[:2] == "i-" and not token.startswith("i-H"):  # i-D7, excluding hour tokens
        return True
    return False

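A few examples of how these predicates classify tokens, with spellings taken from the patterns above:

    is_clinical_event("320128")  # True  - numeric OMOP concept ID
    is_visit_start("[VS]")       # True
    is_att_token("W2")           # True  - two-week interval
    is_att_token("i-H12")        # False - inpatient hour tokens are excluded
    is_att_token("VS-D7-VE")     # True  - merged inpatient visit interval
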
def is_artificial_token(token: str) -> bool:
    if token in VISIT_CONCEPT_IDS:
        return True
    if token in DISCHARGE_CONCEPT_IDS:
        return True
    if is_visit_start(token):
        return True
    if is_visit_end(token):
        return True
    if is_att_token(token):
        return True
    if token == END_TOKEN:
        return True
    return False


def is_inpatient_att_token(token: str) -> bool:
    """
    Check if the token is an inpatient ATT token.

    :param token: Token to check.
    :return: True if the token is an inpatient ATT token, False otherwise.
    """
    return bool(INPATIENT_ATT_PATTERN.match(token))

def extract_time_interval_in_days(token: str):
    """
    Extract the time interval in days from a token.

    :param token: Token to extract from.
    :return: Time interval in days.
    :raises ValueError: If the token is invalid.
    """
    try:
        if token[0] == "D":  # day tokens
            return int(token[1:])
        elif token[0] == "W":  # week tokens
            return int(token[1:]) * 7
        elif token[0] == "M":  # month tokens
            return int(token[1:]) * 30
        elif token[0] == "Y":  # year tokens
            return int(token[1:]) * 365
        elif token == "LT":
            return 365 * 3
        elif token[:3] == "VS-":  # VS-D7-VE
            part = token.split("-")[1]
            if part.startswith("LT"):
                return 365 * 3
            return int(part[1:])
        elif token[:2] == "i-":  # i-D7
            part = token.split("-")[1]
            if part.startswith("LT"):
                return 365 * 3
            return int(part[1:])
    except Exception:
        raise ValueError(f"Invalid time token: {token}")
    raise ValueError(f"Invalid time token: {token}")

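A few worked conversions under the rules above (weeks map to 7 days, months to 30, years to 365, and the long-term "LT" bucket to three years):

    extract_time_interval_in_days("D7")        # 7
    extract_time_interval_in_days("W2")        # 14
    extract_time_interval_in_days("M3")        # 90
    extract_time_interval_in_days("VS-D7-VE")  # 7
    extract_time_interval_in_days("LT")        # 1095
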
def convert_time_interval_to_time_tuple(
    time_interval: int, is_inpatient: bool
) -> Tuple[str, str, str]:
    """
    Convert a time interval to a tuple of time tokens.

    :param time_interval: Time interval in days.
    :param is_inpatient: Whether the interval is for an inpatient.
    :return: Tuple of year, month, and day tokens.
    """
    assert time_interval >= 0, "the time interval must be greater than or equal to zero"
    year = time_interval // 365
    month = time_interval % 365 // 30
    day = time_interval % 365 % 30
    year_token = f"year:{year}"
    month_token = f"month:{month}"
    day_token = f"i-day:{day}" if is_inpatient else f"day:{day}"
    return year_token, month_token, day_token

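For example, a 400-day interval decomposes as 400 = 1 * 365 + 1 * 30 + 5:

    convert_time_interval_to_time_tuple(400, is_inpatient=False)
    # ('year:1', 'month:1', 'day:5')
    convert_time_interval_to_time_tuple(400, is_inpatient=True)
    # ('year:1', 'month:1', 'i-day:5')
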
def generate_artificial_time_tokens():
    """
    Generate all the time tokens used in training.

    :return: List of time tokens.
    """
    day_tokens = [f"D{i}" for i in range(2000)]
    week_tokens = [f"W{i}" for i in range(4)]
    month_tokens = [f"M{i}" for i in range(12)]
    long_term_tokens = ["LT"]
    return day_tokens + week_tokens + month_tokens + long_term_tokens
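This yields 2,000 day tokens, 4 week tokens, 12 month tokens, and the single long-term token:

    time_tokens = generate_artificial_time_tokens()
    len(time_tokens)  # 2017 = 2000 + 4 + 12 + 1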
cehrgpt/models/config.py
ADDED
@@ -0,0 +1,205 @@
from typing import Dict, List

from transformers import PretrainedConfig


class CEHRGPTConfig(PretrainedConfig):
    """
    Args:

        vocab_size (`int`, *optional*, defaults to 50257):
            Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`GPT2Model`] or [`TFGPT2Model`].
        n_positions (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        n_embd (`int`, *optional*, defaults to 768):
            Dimensionality of the embeddings and hidden states.
        n_layer (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        n_head (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        n_inner (`int`, *optional*):
            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd.
        activation_function (`str`, *optional*, defaults to `"gelu_new"`):
            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
        resid_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        embd_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the embeddings.
        attn_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
            The epsilon to use in the layer normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        summary_type (`string`, *optional*, defaults to `"cls_index"`):
            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
            [`TFGPT2DoubleHeadsModel`].

            Has to be one of the following options:

            - `"last"`: Take the last token hidden state (like XLNet).
            - `"first"`: Take the first token hidden state (like BERT).
            - `"mean"`: Take the mean of all tokens hidden states.
            - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
            - `"attn"`: Not implemented now, use multi-head attention.
        summary_use_proj (`bool`, *optional*, defaults to `True`):
            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
            [`TFGPT2DoubleHeadsModel`].

            Whether or not to add a projection after the vector extraction.
        summary_activation (`str`, *optional*):
            Argument used when doing sequence summary. Used for the multiple choice head in
            [`GPT2DoubleHeadsModel`].

            Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
        summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
            [`TFGPT2DoubleHeadsModel`].

            Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
        summary_first_dropout (`float`, *optional*, defaults to 0.1):
            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
            [`TFGPT2DoubleHeadsModel`].

            The dropout ratio to be used after the projection and activation.
        scale_attn_weights (`bool`, *optional*, defaults to `True`):
            Scale attention weights by dividing by sqrt(hidden_size).
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        bos_token_id (`int`, *optional*, defaults to 50256):
            Id of the beginning of sentence token in the vocabulary.
        eos_token_id (`int`, *optional*, defaults to 50256):
            Id of the end of sentence token in the vocabulary.
        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
            Whether to additionally scale attention weights by `1 / (layer_idx + 1)`.
        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
            Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
            dot-product/softmax to float() when training with mixed precision.
    """

    model_type = "cehrgpt"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "hidden_size": "n_embd",
        "max_position_embeddings": "n_positions",
        "num_attention_heads": "n_head",
        "num_hidden_layers": "n_layer",
    }

    @property
    def token_to_time_token_mapping(self) -> Dict[int, List[int]]:
        # Serialization converts the keys of _token_to_time_token_mapping to
        # strings, so we need to convert them back to ints
        return {
            int(token): list(map(int, sub_tokens))
            for token, sub_tokens in self._token_to_time_token_mapping.items()
        }

    def __init__(
        self,
        vocab_size=50257,
        time_token_vocab_size=50257,
        n_positions=1024,
        n_embd=768,
        n_layer=12,
        n_head=12,
        n_inner=None,
        activation_function="gelu_new",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        summary_type="cls_index",
        summary_use_proj=True,
        summary_activation=None,
        summary_proj_to_labels=True,
        summary_first_dropout=0.1,
        scale_attn_weights=True,
        use_cache=True,
        bos_token_id=50256,
        eos_token_id=50256,
        lab_token_ids=None,
        scale_attn_by_inverse_layer_idx=False,
        reorder_and_upcast_attn=False,
        exclude_position_ids=False,
        include_values=False,
        value_vocab_size=None,
        include_ttv_prediction=False,
        use_sub_time_tokenization=True,
        token_to_time_token_mapping: Dict[int, List] = None,
        use_pretrained_embeddings=False,
        n_pretrained_embeddings_layers=2,
        pretrained_embedding_dim=768,
        pretrained_token_ids: List[int] = None,
        time_token_loss_weight=1.0,
        time_to_visit_loss_weight=1.0,
        causal_sfm=False,
        demographics_size=4,
        lab_token_penalty=False,
        lab_token_loss_weight=0.9,
        entropy_penalty=False,
        entropy_penalty_alpha=0.01,
        **kwargs,
    ):
        if token_to_time_token_mapping is None:
            token_to_time_token_mapping = {}
        if pretrained_token_ids is None:
            pretrained_token_ids = list()
        self.vocab_size = vocab_size
        self.time_token_vocab_size = time_token_vocab_size
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_inner = n_inner
        self.activation_function = activation_function
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.summary_type = summary_type
        self.summary_use_proj = summary_use_proj
        self.summary_activation = summary_activation
        self.summary_first_dropout = summary_first_dropout
        self.summary_proj_to_labels = summary_proj_to_labels
        self.scale_attn_weights = scale_attn_weights
        self.use_cache = use_cache
        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
        self.reorder_and_upcast_attn = reorder_and_upcast_attn

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.lab_token_ids = lab_token_ids

        self.exclude_position_ids = exclude_position_ids
        self.include_values = include_values
        self.value_vocab_size = value_vocab_size

        self.include_ttv_prediction = include_ttv_prediction
        self.use_sub_time_tokenization = use_sub_time_tokenization
        self._token_to_time_token_mapping = token_to_time_token_mapping
        self.time_token_loss_weight = time_token_loss_weight
        self.time_to_visit_loss_weight = time_to_visit_loss_weight
        self.causal_sfm = causal_sfm
        self.demographics_size = demographics_size
        self.use_pretrained_embeddings = use_pretrained_embeddings
        self.pretrained_embedding_dim = pretrained_embedding_dim
        self.pretrained_token_ids = pretrained_token_ids
        self.n_pretrained_embeddings_layers = n_pretrained_embeddings_layers
        # self.tie_word_embeddings = not use_pretrained_embeddings

        self.lab_token_penalty = lab_token_penalty
        self.lab_token_loss_weight = lab_token_loss_weight
        self.entropy_penalty = entropy_penalty
        self.entropy_penalty_alpha = entropy_penalty_alpha

        kwargs["tie_word_embeddings"] = not use_pretrained_embeddings

        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

    @property
    def lab_token_exists(self) -> bool:
        return self.lab_token_ids is not None and len(self.lab_token_ids) > 0
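A hedged construction sketch (all values below are illustrative, not package defaults): a small configuration with a hypothetical sub-time-token decomposition. Because `use_pretrained_embeddings` defaults to `False`, `tie_word_embeddings` is forced to `True` before the parent constructor runs, and the string-keyed mapping round-trips back to ints through the property.

    config = CEHRGPTConfig(
        vocab_size=20000,
        n_positions=1024,
        n_embd=384,
        n_layer=6,
        n_head=6,
        use_sub_time_tokenization=True,
        token_to_time_token_mapping={100: [1, 2, 3]},  # hypothetical decomposition
    )
    config.token_to_time_token_mapping  # {100: [1, 2, 3]}
    config.lab_token_exists             # False, since lab_token_ids is None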