cehrgpt 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. __init__.py +0 -0
  2. cehrgpt/__init__.py +0 -0
  3. cehrgpt/analysis/__init__.py +0 -0
  4. cehrgpt/analysis/privacy/__init__.py +0 -0
  5. cehrgpt/analysis/privacy/attribute_inference.py +275 -0
  6. cehrgpt/analysis/privacy/attribute_inference_config.yml +8975 -0
  7. cehrgpt/analysis/privacy/member_inference.py +172 -0
  8. cehrgpt/analysis/privacy/nearest_neighbor_inference.py +189 -0
  9. cehrgpt/analysis/privacy/reid_inference.py +407 -0
  10. cehrgpt/analysis/privacy/utils.py +255 -0
  11. cehrgpt/cehrgpt_args.py +142 -0
  12. cehrgpt/data/__init__.py +0 -0
  13. cehrgpt/data/hf_cehrgpt_dataset.py +80 -0
  14. cehrgpt/data/hf_cehrgpt_dataset_collator.py +482 -0
  15. cehrgpt/data/hf_cehrgpt_dataset_mapping.py +116 -0
  16. cehrgpt/generation/__init__.py +0 -0
  17. cehrgpt/generation/chatgpt_generation.py +106 -0
  18. cehrgpt/generation/generate_batch_hf_gpt_sequence.py +333 -0
  19. cehrgpt/generation/omop_converter_batch.py +644 -0
  20. cehrgpt/generation/omop_entity.py +515 -0
  21. cehrgpt/gpt_utils.py +331 -0
  22. cehrgpt/models/__init__.py +0 -0
  23. cehrgpt/models/config.py +205 -0
  24. cehrgpt/models/hf_cehrgpt.py +1817 -0
  25. cehrgpt/models/hf_modeling_outputs.py +158 -0
  26. cehrgpt/models/pretrained_embeddings.py +82 -0
  27. cehrgpt/models/special_tokens.py +30 -0
  28. cehrgpt/models/tokenization_hf_cehrgpt.py +1077 -0
  29. cehrgpt/omop/__init__.py +0 -0
  30. cehrgpt/omop/condition_era.py +20 -0
  31. cehrgpt/omop/observation_period.py +43 -0
  32. cehrgpt/omop/omop_argparse.py +38 -0
  33. cehrgpt/omop/omop_table_builder.py +86 -0
  34. cehrgpt/omop/queries/__init__.py +0 -0
  35. cehrgpt/omop/queries/condition_era.py +86 -0
  36. cehrgpt/omop/queries/observation_period.py +135 -0
  37. cehrgpt/omop/sample_omop_tables.py +71 -0
  38. cehrgpt/runners/__init__.py +0 -0
  39. cehrgpt/runners/gpt_runner_util.py +99 -0
  40. cehrgpt/runners/hf_cehrgpt_finetune_runner.py +746 -0
  41. cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +370 -0
  42. cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +137 -0
  43. cehrgpt/runners/hyperparameter_search_util.py +223 -0
  44. cehrgpt/time_to_event/__init__.py +0 -0
  45. cehrgpt/time_to_event/config/30_day_readmission.yaml +8 -0
  46. cehrgpt/time_to_event/config/next_visit_type_prediction.yaml +8 -0
  47. cehrgpt/time_to_event/config/t2dm_hf.yaml +8 -0
  48. cehrgpt/time_to_event/time_to_event_model.py +226 -0
  49. cehrgpt/time_to_event/time_to_event_prediction.py +347 -0
  50. cehrgpt/time_to_event/time_to_event_utils.py +55 -0
  51. cehrgpt/tools/__init__.py +0 -0
  52. cehrgpt/tools/ehrshot_benchmark.py +74 -0
  53. cehrgpt/tools/generate_pretrained_embeddings.py +130 -0
  54. cehrgpt/tools/merge_synthetic_real_dataasets.py +218 -0
  55. cehrgpt/tools/upload_omop_tables.py +108 -0
  56. cehrgpt-0.0.1.dist-info/LICENSE +21 -0
  57. cehrgpt-0.0.1.dist-info/METADATA +66 -0
  58. cehrgpt-0.0.1.dist-info/RECORD +60 -0
  59. cehrgpt-0.0.1.dist-info/WHEEL +5 -0
  60. cehrgpt-0.0.1.dist-info/top_level.txt +2 -0
cehrgpt/generation/chatgpt_generation.py
@@ -0,0 +1,106 @@
+ import os
+ from textwrap import dedent
+
+ import numpy as np
+ from jinja2 import BaseLoader, Environment
+ from openai import OpenAI
+ from pydantic import BaseModel
+
+ MODEL = "gpt-4o-2024-08-06"
+ TEMPLATE = """
+ You are a medical professional tasked with generating a synthetic patient sequence using the CEHR-GPT format, outlined as follows:
+
+ [year]: Represents the start year of the patient sequence.
+ [age]: Represents the start age of the patient sequence.
+ [gender]: The patient's gender; allowed values are "Male," "Female," and "Unknown."
+ [race]: The patient's race; allowed values are "White," "Black," "Asian," and "Unknown."
+ [VS]: Marks the start of a visit.
+ [VE]: Marks the end of a visit.
+ [VT]: Type of visit, with allowed values "9202" (outpatient), "9201" (inpatient), and "9203" (emergency room).
+ [C_i]: Clinical concept represented by an OMOP concept ID (could be a drug, condition, or procedure).
+ [ATT]: Artificial time tokens representing the interval in days between visits (e.g., "D1," "D10").
+ [i-ATT]: Inpatient-specific artificial time tokens representing intervals within an inpatient stay (e.g., "i-D1"); these tokens should only appear in inpatient visits.
+ Each sequence can encompass multiple concepts within each visit and vary from one to ten visits, reflecting real-world clinical scenarios. All clinical concepts must correspond to valid OMOP IDs. The sequence must end on [VE].
+
+ Example of a sequence:
+
+ {
+ "seq": ['year:2008', 'age:28', '8532', '8527', '[VS]', '9202', '4301351',
+ '19078924', '35603428', '35603429', '40221381', '40223365',
+ '4155151', '4239130', '42536500', '4294382', '2108974', '433736',
+ '[VE]', 'D7', '[VS]', '9201', '43011850', '35603429', '35603600',
+ '35605482', '40163870', '40169706', '40221381', '35603428',
+ '19078921', '40244026', '948080', '1154615', '1593063', '4056973',
+ '4155151', '4194550', '3047860', '35604843', '43011962', '4160730',
+ 'i-D1', '35604843', '40162587', '43011962', '433736', '948080',
+ '0', '[VE]', 'D14', '[VS]', '9202', '4019497', '[VE]', 'D26', '[VS]',
+ '1', '4019497', '[VE]', 'D198', '[VS]', '581477', '433736',
+ '[VE]', 'D19', '[VS]', '581477', '194152', '320128', '40483287', '433736', '[VE]']
+ }
+
+ When creating the sequence, please use the demographic tokens {{ demographic_prompt }} to construct a realistic and medically plausible patient trajectory.
+ """
+
+
+ class PatientSequence(BaseModel):
+     seq: list[str]
+
+
+ if __name__ == "__main__":
+     import argparse
+     import uuid
+
+     import pandas as pd
+     from tqdm import tqdm
+
+     parser = argparse.ArgumentParser("ChatGPT patient generation")
+     parser.add_argument(
+         "--demographic_data",
+         dest="demographic_data",
+         action="store",
+         help="The path to your demographic data",
+         required=True,
+     )
+     parser.add_argument(
+         "--output_folder",
+         dest="output_folder",
+         action="store",
+         help="The path to your output folder",
+         required=True,
+     )
+     parser.add_argument(
+         "--num_sequences",
+         dest="num_sequences",
+         action="store",
+         type=int,
+         help="The number of sequences to generate",
+         required=True,
+     )
+     args = parser.parse_args()
+     # Create a Jinja2 environment and render the template
+     env = Environment(loader=BaseLoader())
+     template = env.from_string(TEMPLATE)
+     demographics = pd.read_parquet(args.demographic_data)
+     # Create the OpenAI client once and reuse it for every request
+     client = OpenAI(api_key=os.environ.get("OPEN_AI_KEY"))
+
+     for _ in tqdm(range(args.num_sequences)):
+         # Seed the prompt with one randomly sampled patient's demographic tokens
+         demographic_tokens = str(demographics.sample(1).concept_ids.iloc[0].tolist())
+         prompt = template.render(demographic_prompt=demographic_tokens)
+         completion = client.beta.chat.completions.parse(
+             model=MODEL,
+             messages=[
+                 {"role": "system", "content": "You are a medical professional."},
+                 {"role": "user", "content": dedent(prompt)},
+             ],
+             response_format=PatientSequence,
+         )
+         patient_sequence = completion.choices[0].message.parsed.seq
+         # concept_values is a placeholder column; these sequences carry no numeric values
+         pd.DataFrame(
+             [
+                 {
+                     "concept_ids": patient_sequence,
+                     "concept_values": np.zeros_like(patient_sequence),
+                 }
+             ],
+             columns=["concept_ids", "concept_values"],
+         ).to_parquet(os.path.join(args.output_folder, f"{uuid.uuid4()}.parquet"))
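
Note: response_format=PatientSequence uses the OpenAI structured-outputs API, so the reply is parsed directly into a list of token strings rather than free text. Below is a minimal sketch, not part of the wheel, of how a returned sequence could be checked against the format the prompt describes; the helper name is hypothetical:

def is_well_formed(seq: list[str]) -> bool:
    """Check the demographic prefix and [VS]/[VE] pairing of a CEHR-GPT sequence."""
    # The first four tokens are the demographic prompt: year, age, gender, race
    if len(seq) < 4 or not seq[0].startswith("year:") or not seq[1].startswith("age:"):
        return False
    open_visit = False
    for token in seq[4:]:
        if token == "[VS]":
            if open_visit:  # nested visits are not allowed
                return False
            open_visit = True
        elif token == "[VE]":
            if not open_visit:
                return False
            open_visit = False
    # The prompt requires every sequence to end on [VE]
    return not open_visit and seq[-1] == "[VE]"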

cehrgpt/generation/generate_batch_hf_gpt_sequence.py
@@ -0,0 +1,333 @@
+ import datetime
+ import os
+ import random
+ import uuid
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
+
+ import numpy as np
+ import pandas as pd
+ import torch
+ from cehrbert.runners.runner_util import load_parquet_as_dataset
+ from transformers import GenerationConfig
+ from transformers.utils import is_flash_attn_2_available, logging
+
+ from cehrgpt.cehrgpt_args import create_inference_base_arg_parser
+ from cehrgpt.generation.omop_converter_batch import START_TOKEN_SIZE
+ from cehrgpt.gpt_utils import get_cehrgpt_output_folder
+ from cehrgpt.models.hf_cehrgpt import CEHRGPT2LMHeadModel
+ from cehrgpt.models.special_tokens import END_TOKEN
+ from cehrgpt.models.tokenization_hf_cehrgpt import (
+     NA,
+     CehrGptTokenizer,
+     is_valid_valid_bin,
+ )
+
+ LOG = logging.get_logger("transformers")
+
+
+ def normalize_value(
+     seq: Sequence[str],
+     values: Sequence[str],
+     tokenizer: CehrGptTokenizer,
+ ) -> Tuple[
+     Sequence[str],
+     Optional[Sequence[Optional[int]]],
+     Optional[Sequence[Optional[float]]],
+     Optional[Sequence[Optional[str]]],
+     Optional[Sequence[str]],
+ ]:
+     concepts = []
+     number_as_values = []
+     concept_as_values = []
+     is_numeric_types = []
+     units = []
+     for concept, value in zip(seq, values):
+         if concept == END_TOKEN:
+             break
+         number_as_value = None
+         concept_as_value = value if value and value.isnumeric() else None
+         is_numeric_type = 0
+         unit = NA
+         # If the parallel value token is a valid value bin, map it back to a
+         # numeric value and unit using the tokenizer's statistics
+         if is_valid_valid_bin(value):
+             converted_value, unit = tokenizer.denormalize(concept, value)
+             if isinstance(converted_value, float):
+                 number_as_value = converted_value
+                 is_numeric_type = 1
+
+         concepts.append(concept)
+         number_as_values.append(number_as_value)
+         concept_as_values.append(concept_as_value)
+         is_numeric_types.append(is_numeric_type)
+         units.append(unit)
+
+     return (
+         concepts,
+         is_numeric_types,
+         number_as_values,
+         concept_as_values,
+         units,
+     )
+
+
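
A quick illustration of the calling convention (a sketch: the concept ID and bin token are made up, and tokenizer is assumed to be a loaded CehrGptTokenizer). seq and values are parallel sequences, and the function returns five parallel columns:

concepts, is_numeric_types, number_as_values, concept_as_values, units = normalize_value(
    seq=["3004249", "[VE]"],  # decoded concept tokens (illustrative lab concept)
    values=["bin_4", NA],     # parallel value tokens; "bin_4" is made up
    tokenizer=tokenizer,
)
# If "bin_4" is a valid value bin, number_as_values[0] holds the denormalized
# float and units[0] its unit; otherwise they stay None / NA.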
+ def generate_single_batch(
+     model: CEHRGPT2LMHeadModel,
+     tokenizer: CehrGptTokenizer,
+     prompts: List[List[int]],
+     max_new_tokens=512,
+     mini_num_of_concepts=1,
+     top_p=0.95,
+     top_k=50,
+     temperature=1.0,
+     repetition_penalty=1.0,
+     num_beams=1,
+     num_beam_groups=1,
+     epsilon_cutoff=0.0,
+     device: Any = "cpu",
+ ) -> Dict[str, Any]:
+     with torch.no_grad():
+         generation_config = GenerationConfig(
+             repetition_penalty=repetition_penalty,
+             max_length=max_new_tokens,
+             min_length=mini_num_of_concepts,
+             temperature=temperature,
+             top_p=top_p,
+             top_k=top_k,
+             bos_token_id=tokenizer.end_token_id,
+             eos_token_id=tokenizer.end_token_id,
+             pad_token_id=tokenizer.pad_token_id,
+             do_sample=True,
+             use_cache=True,
+             return_dict_in_generate=True,
+             output_attentions=False,
+             output_hidden_states=False,
+             output_scores=False,
+             renormalize_logits=True,
+             num_beams=num_beams,
+             num_beam_groups=num_beam_groups,
+             epsilon_cutoff=epsilon_cutoff,
+         )
+         batched_prompts = torch.tensor(prompts).to(device)
+         results = model.generate(
+             inputs=batched_prompts,
+             generation_config=generation_config,
+             lab_token_ids=tokenizer.lab_token_ids,
+         )
+
+     sequences = [
+         tokenizer.decode(seq.cpu().numpy(), skip_special_tokens=False)
+         for seq in results.sequences
+     ]
+     if results.sequence_vals is not None:
+         values = [
+             tokenizer.decode_value(values.cpu().numpy(), skip_special_tokens=False)
+             for values in results.sequence_vals
+         ]
+     else:
+         # No value output from the model: fill the value columns with the NA token
+         values = np.zeros_like(sequences)
+         values.fill(NA)
+     if results.sequence_val_masks is not None:
+         value_indicators = results.sequence_val_masks.cpu().numpy()
+     else:
+         value_indicators = np.zeros_like(sequences, dtype=np.int32).astype(bool)
+     return {
+         "sequences": sequences,
+         "values": values,
+         "value_indicators": value_indicators,
+     }
+
+
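
A minimal sketch of calling generate_single_batch directly rather than through main(); the model/tokenizer path is hypothetical, and the prompt is an encoded demographic prefix like the one in the example above:

import torch
from cehrgpt.generation.generate_batch_hf_gpt_sequence import generate_single_batch
from cehrgpt.models.hf_cehrgpt import CEHRGPT2LMHeadModel
from cehrgpt.models.tokenization_hf_cehrgpt import CehrGptTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = CehrGptTokenizer.from_pretrained("models/cehrgpt")  # hypothetical path
model = CEHRGPT2LMHeadModel.from_pretrained("models/cehrgpt").eval().to(device)

# One prompt: year/age/gender/race tokens taken from a real patient record
prompts = [tokenizer.encode(["year:2008", "age:28", "8532", "8527"])]
batch = generate_single_batch(model, tokenizer, prompts, max_new_tokens=512, device=device)
print(batch["sequences"][0])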
+ def main(args):
+     if torch.cuda.is_available():
+         device = torch.device("cuda")
+     else:
+         device = torch.device("cpu")
+
+     cehrgpt_tokenizer = CehrGptTokenizer.from_pretrained(args.tokenizer_folder)
+     cehrgpt_model = (
+         CEHRGPT2LMHeadModel.from_pretrained(
+             args.model_folder,
+             attn_implementation=(
+                 "flash_attention_2" if is_flash_attn_2_available() else "eager"
+             ),
+             torch_dtype=(
+                 torch.bfloat16
+                 if is_flash_attn_2_available() and args.use_bfloat16
+                 else torch.float32
+             ),
+         )
+         .eval()
+         .to(device)
+     )
+     cehrgpt_model.generation_config.pad_token_id = cehrgpt_tokenizer.pad_token_id
+     cehrgpt_model.generation_config.eos_token_id = cehrgpt_tokenizer.end_token_id
+     cehrgpt_model.generation_config.bos_token_id = cehrgpt_tokenizer.end_token_id
+
+     folder_name = get_cehrgpt_output_folder(args, cehrgpt_tokenizer)
+     output_folder_name = os.path.join(
+         args.output_folder, folder_name, "generated_sequences"
+     )
+
+     if not os.path.exists(output_folder_name):
+         os.makedirs(output_folder_name)
+
+     # Cap the prompt length only when --drop_long_sequences is set;
+     # otherwise accept demographics of any sequence length
+     max_seq_allowed = (
+         cehrgpt_model.config.n_positions
+         if args.drop_long_sequences
+         else np.iinfo(np.int32).max
+     )
+
+     LOG.info(f"Loading tokenizer at {args.tokenizer_folder}")
+     LOG.info(f"Loading model at {args.model_folder}")
+     LOG.info(f"Write sequences to {output_folder_name}")
+     LOG.info(f"Context window {args.context_window}")
+     LOG.info(f"Max sequence allowed {max_seq_allowed}")
+     LOG.info(f"Temperature {args.temperature}")
+     LOG.info(f"Repetition Penalty {args.repetition_penalty}")
+     LOG.info(f"Sampling Strategy {args.sampling_strategy}")
+     LOG.info(f"Num beam {args.num_beams}")
+     LOG.info(f"Num beam groups {args.num_beam_groups}")
+     LOG.info(f"Epsilon cutoff {args.epsilon_cutoff}")
+     LOG.info(f"Top P {args.top_p}")
+     LOG.info(f"Top K {args.top_k}")
+     LOG.info(f"Loading demographic_info at {args.demographic_data_path}")
+
+     dataset = load_parquet_as_dataset(args.demographic_data_path)
+     total_rows = len(dataset)
+
+     num_of_batches = args.num_of_patients // args.batch_size + 1
+     sequence_to_flush = []
+     current_person_id = 1
+     for i in range(num_of_batches):
+         LOG.info(f"{datetime.datetime.now()}: Batch {i} started")
+
+         # Randomly pick demographics from the existing population
+         random_prompts = []
+         attempts = 0
+         while len(random_prompts) < args.batch_size:
+             for row in dataset.select(
+                 random.sample(range(total_rows), k=args.batch_size)
+             ):
+                 if (
+                     args.min_num_of_concepts
+                     <= len(row["concept_ids"])
+                     <= max_seq_allowed
+                 ):
+                     random_prompts.append(
+                         cehrgpt_tokenizer.encode(row["concept_ids"][:START_TOKEN_SIZE])
+                     )
+             attempts += 1
+             # Give up if ten rounds of sampling produced no qualifying prompt
+             if not random_prompts and attempts > 10:
+                 raise RuntimeError(
+                     f"The length of concept_ids in {args.demographic_data_path} does not qualify!"
+                 )
+
+         # Make sure the batch does not exceed batch_size
+         batch_sequences = generate_single_batch(
+             cehrgpt_model,
+             cehrgpt_tokenizer,
+             random_prompts[: args.batch_size],
+             max_new_tokens=args.context_window,
+             mini_num_of_concepts=args.min_num_of_concepts,
+             top_p=args.top_p,
+             top_k=args.top_k,
+             temperature=args.temperature,
+             repetition_penalty=args.repetition_penalty,
+             num_beams=args.num_beams,
+             num_beam_groups=args.num_beam_groups,
+             epsilon_cutoff=args.epsilon_cutoff,
+             device=device,
+         )
+
+         # Clear the GPU cache between batches
+         torch.cuda.empty_cache()
+
+         for concept_ids, value_indicators, values in zip(
+             batch_sequences["sequences"],
+             batch_sequences["value_indicators"],
+             batch_sequences["values"],
+         ):
+             (
+                 concept_ids,
+                 is_numeric_types,
+                 number_as_values,
+                 concept_as_values,
+                 units,
+             ) = normalize_value(concept_ids, values, cehrgpt_tokenizer)
+             output = {"concept_ids": concept_ids, "person_id": current_person_id}
+             if is_numeric_types is not None:
+                 output["is_numeric_types"] = is_numeric_types
+             if number_as_values is not None:
+                 output["number_as_values"] = number_as_values
+             if concept_as_values is not None:
+                 output["concept_as_values"] = concept_as_values
+             if value_indicators is not None:
+                 output["concept_value_masks"] = value_indicators
+             if units is not None:
+                 output["units"] = units
+
+             sequence_to_flush.append(output)
+             current_person_id += 1
+
+         if len(sequence_to_flush) >= args.buffer_size:
+             LOG.info(f"{datetime.datetime.now()}: Flushing to the Disk at Batch {i}")
+             pd.DataFrame(
+                 sequence_to_flush,
+                 columns=[
+                     "concept_ids",
+                     "person_id",
+                     "is_numeric_types",
+                     "number_as_values",
+                     "concept_as_values",
+                     "concept_value_masks",
+                     "units",
+                 ],
+             ).to_parquet(os.path.join(output_folder_name, f"{uuid.uuid4()}.parquet"))
+             sequence_to_flush.clear()
+
+     if len(sequence_to_flush) > 0:
+         LOG.info(f"{datetime.datetime.now()}: Flushing to the Disk at Final Batch")
+         pd.DataFrame(
+             sequence_to_flush,
+             columns=[
+                 "concept_ids",
+                 "person_id",
+                 "is_numeric_types",
+                 "number_as_values",
+                 "concept_as_values",
+                 "concept_value_masks",
+                 "units",
+             ],
+         ).to_parquet(os.path.join(output_folder_name, f"{uuid.uuid4()}-last.parquet"))
+
+
+ def create_arg_parser():
+     base_arg_parser = create_inference_base_arg_parser(
+         description="Arguments for generating patient sequences"
+     )
+     base_arg_parser.add_argument(
+         "--num_of_patients",
+         dest="num_of_patients",
+         action="store",
+         type=int,
+         help="The number of patients that will be generated",
+         required=True,
+     )
+     base_arg_parser.add_argument(
+         "--demographic_data_path",
+         dest="demographic_data_path",
+         action="store",
+         help="The path to the demographic data used to seed the prompts",
+         required=True,
+     )
+     base_arg_parser.add_argument(
+         "--drop_long_sequences",
+         dest="drop_long_sequences",
+         action="store_true",
+     )
+     return base_arg_parser
+
+
+ if __name__ == "__main__":
+     main(create_arg_parser().parse_args())
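
The flushed batches can be inspected with pandas; a sketch (the run folder name is produced by get_cehrgpt_output_folder at runtime, so the path below is hypothetical):

import pandas as pd

# pandas/pyarrow can read a directory of parquet files in one call
df = pd.read_parquet("output/<run_folder>/generated_sequences")
print(df[["person_id", "concept_ids", "units"]].head())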