PyPI - cehrgpt - Versions diffs - 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl - Mend

cehrgpt 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

cehrgpt/data/hf_cehrgpt_dataset_mapping.py +267 -1
cehrgpt/data/hf_cehrgpt_dpo_collator.py +71 -0
cehrgpt/data/hf_cehrgpt_dpo_dataset_mapping.py +61 -0
cehrgpt/generation/generate_paired_cehrgpt_sequence.py +224 -0
cehrgpt/generation/omop_converter_batch.py +3 -0
cehrgpt/models/hf_cehrgpt.py +1 -0
cehrgpt/models/tokenization_hf_cehrgpt.py +2 -2
cehrgpt/rl_finetune/__init__.py +0 -0
cehrgpt/rl_finetune/cehrgpt_dpo_trainer.py +586 -0
cehrgpt/rl_finetune/cehrgpt_ppo_trainer.py +464 -0
cehrgpt/rl_finetune/ppo_finetune.py +394 -0
cehrgpt/rl_finetune/ppo_finetune_v2.py +373 -0
cehrgpt/runners/hf_cehrgpt_dpo_runner.py +119 -0
cehrgpt/runners/hf_cehrgpt_finetune_runner.py +24 -3
cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +44 -8
cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +4 -0
cehrgpt/tools/generate_causal_patient_split_by_age.py +146 -0
{cehrgpt-0.0.1.dist-info → cehrgpt-0.0.2.dist-info}/METADATA +52 -6
{cehrgpt-0.0.1.dist-info → cehrgpt-0.0.2.dist-info}/RECORD +22 -12
{cehrgpt-0.0.1.dist-info → cehrgpt-0.0.2.dist-info}/WHEEL +1 -1
{cehrgpt-0.0.1.dist-info → cehrgpt-0.0.2.dist-info}/LICENSE +0 -0
{cehrgpt-0.0.1.dist-info → cehrgpt-0.0.2.dist-info}/top_level.txt +0 -0

cehrgpt/data/hf_cehrgpt_dataset_mapping.py CHANGED Viewed

@@ -2,7 +2,18 @@ import datetime
 from typing import Any, Dict
 import numpy as np
-from cehrbert.data_generators.hf_data_generator.hf_dataset_mapping import DatasetMapping
+import pandas as pd
+from cehrbert.data_generators.hf_data_generator.hf_dataset_mapping import (
+    ED_VISIT_TYPE_CODES,
+    INPATIENT_VISIT_TYPE_CODES,
+    INPATIENT_VISIT_TYPES,
+    DatasetMapping,
+    replace_escape_chars,
+)
+from cehrbert.runners.hf_runner_argument_dataclass import DataTrainingArguments
+from cehrbert_data.const.common import NA
+from cehrbert_data.decorators.patient_event_decorator_base import get_att_function
+from dateutil.relativedelta import relativedelta
 from cehrgpt.models.tokenization_hf_cehrgpt import (
     NONE_BIN,
@@ -17,6 +28,261 @@ def convert_date_to_posix_time(index_date: datetime.date) -> float:
     ).timestamp()
+class MedToCehrGPTDatasetMapping(DatasetMapping):
+    def __init__(
+        self,
+        data_args: DataTrainingArguments,
+        is_pretraining: bool = True,
+        include_inpatient_hour_token: bool = True,
+    ):
+        self._time_token_function = get_att_function(data_args.att_function_type)
+        self._include_auxiliary_token = data_args.include_auxiliary_token
+        self._inpatient_time_token_function = get_att_function(
+            data_args.inpatient_att_function_type
+        )
+        self._include_demographic_prompt = data_args.include_demographic_prompt
+        self._is_pretraining = is_pretraining
+        self._include_inpatient_hour_token = include_inpatient_hour_token
+    """
+    This mapping function converts the MED (https://github.com/Medical-Event-Data-Standard/meds/tree/main) extension
+    to the CehrGPT format. We make several assumptions
+    - The first event contains the demographic information
+    - From the second event onward
+        - the time of the event is visit_start_datetime.
+        - the first measurement contains the code indicating a standard OMOP Visit concept_id (e.g. 9201, 9202)
+        - in case of inpatient visits, the last measurement is assumed to
+            contain the standard OMOP concept id for discharge facilities (e.g 8536)
+        - in case of inpatient visits, datetime_value of the last measurement stores visit_end_datetime
+    """
+    def remove_columns(self):
+        if self._is_pretraining:
+            return ["visits", "birth_datetime", "index_date"]
+        else:
+            return [
+                "visits",
+                "birth_datetime",
+                "visit_concept_ids",
+            ]
+    @staticmethod
+    def _update_cehrgpt_record(
+        cehrgpt_record: Dict[str, Any],
+        code: str,
+        concept_value_mask: int = 0,
+        number_as_value: float = 0.0,
+        concept_as_value: str = "0",
+        is_numeric_type: int = 0,
+        unit: str = NA,
+    ) -> None:
+        cehrgpt_record["concept_ids"].append(replace_escape_chars(code))
+        cehrgpt_record["concept_value_masks"].append(concept_value_mask)
+        cehrgpt_record["number_as_values"].append(number_as_value)
+        cehrgpt_record["concept_as_values"].append(concept_as_value)
+        cehrgpt_record["units"].append(unit)
+        cehrgpt_record["is_numeric_types"].append(is_numeric_type)
+    def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
+        cehrgpt_record = {
+            "person_id": record["patient_id"],
+            "concept_ids": [],
+            "concept_value_masks": [],
+            "number_as_values": [],
+            "concept_as_values": [],
+            "units": [],
+            "is_numeric_types": [],
+        }
+        # Extract the demographic information
+        birth_datetime = record["birth_datetime"]
+        if isinstance(birth_datetime, pd.Timestamp):
+            birth_datetime = birth_datetime.to_pydatetime()
+        gender = record["gender"]
+        race = record["race"]
+        # Add the demographic tokens
+        first_visit = record["visits"][0]
+        year_str = f'year:{str(first_visit["visit_start_datetime"].year)}'
+        age_str = f'age:{str(relativedelta(first_visit["visit_start_datetime"], birth_datetime).years)}'
+        self._update_cehrgpt_record(cehrgpt_record, year_str)
+        self._update_cehrgpt_record(cehrgpt_record, age_str)
+        self._update_cehrgpt_record(cehrgpt_record, gender)
+        self._update_cehrgpt_record(cehrgpt_record, race)
+        # Use a data cursor to keep track of time
+        date_cursor = None
+        # Loop through all the visits excluding the first event containing the demographics
+        for i, visit in enumerate(
+            sorted(record["visits"], key=lambda e: e["visit_start_datetime"])
+        ):
+            events = visit["events"]
+            # Skip this visit if the number measurements in the event is zero
+            if events is None or len(events) == 0:
+                continue
+            visit_start_datetime = visit["visit_start_datetime"]
+            time_delta = (
+                (visit_start_datetime - date_cursor).days if date_cursor else None
+            )
+            date_cursor = visit_start_datetime
+            # We assume the first measurement to be the visit type of the current visit
+            visit_type = visit["visit_type"]
+            is_er_or_inpatient = (
+                visit_type in INPATIENT_VISIT_TYPES
+                or visit_type in INPATIENT_VISIT_TYPE_CODES
+                or visit_type in ED_VISIT_TYPE_CODES
+            )
+            # Add artificial time tokens to the patient timeline if timedelta exists
+            if time_delta is not None:
+                # This generates an artificial time token depending on the choice of the time token functions
+                self._update_cehrgpt_record(
+                    cehrgpt_record,
+                    code=self._time_token_function(time_delta),
+                )
+            # Add the VS token to the patient timeline to mark the start of a visit
+            relativedelta(visit["visit_start_datetime"], birth_datetime).years
+            # Calculate the week number since the epoch time
+            date = (
+                visit["visit_start_datetime"]
+                - datetime.datetime(year=1970, month=1, day=1)
+            ).days // 7
+            # Add a [VS] token
+            self._update_cehrgpt_record(
+                cehrgpt_record,
+                code="[VS]",
+            )
+            # Add a visit type token
+            self._update_cehrgpt_record(
+                cehrgpt_record,
+                code=visit_type,
+            )
+            # Keep track of the existing outpatient events, we don't want to add them again
+            existing_outpatient_events = list()
+            for e in events:
+                # If the event doesn't have a time stamp, we skip it
+                if not e["time"]:
+                    continue
+                # If numeric_value exists, this is a concept/value tuple, we indicate this using a concept_value_mask
+                numeric_value = e.get("numeric_value", None)
+                text_value = e.get("text_value", None)
+                # The unit might be populated with a None value
+                unit = e.get("unit", NA) if e.get("unit", NA) else NA
+                concept_value_mask = int(
+                    numeric_value is not None or text_value is not None
+                )
+                is_numeric_type = int(numeric_value is not None)
+                code = replace_escape_chars(e["code"])
+                # Add a medical token to the patient timeline
+                # If this is an inpatient visit, we use the event time stamps to calculate age and date
+                # because the patient can stay in the hospital for a period of time.
+                if is_er_or_inpatient:
+                    # Calculate the week number since the epoch time
+                    date = (
+                        e["time"] - datetime.datetime(year=1970, month=1, day=1)
+                    ).days // 7
+                    # Calculate the time diff in days w.r.t the previous measurement
+                    meas_time_diff = (e["time"] - date_cursor).days
+                    # Update the date_cursor if the time diff between two neighboring measurements is greater than and
+                    # equal to 1 day
+                    if meas_time_diff > 0:
+                        date_cursor = e["time"]
+                        if self._inpatient_time_token_function:
+                            # This generates an artificial time token depending on the choice of the time token functions
+                            self._update_cehrgpt_record(
+                                cehrgpt_record,
+                                code=f"i-{self._inpatient_time_token_function(meas_time_diff)}",
+                            )
+                else:
+                    # For outpatient visits, we use the visit time stamp to calculate age and time because we assume
+                    # the outpatient visits start and end on the same day.
+                    # We check whether the date/code/value combination already exists in the existing events
+                    # If they exist, we do not add them to the patient timeline for outpatient visits.
+                    if (
+                        date,
+                        code,
+                        numeric_value,
+                        text_value,
+                        concept_value_mask,
+                        numeric_value,
+                    ) in existing_outpatient_events:
+                        continue
+                self._update_cehrgpt_record(
+                    cehrgpt_record,
+                    code=code,
+                    concept_value_mask=concept_value_mask,
+                    unit=unit,
+                    number_as_value=numeric_value if numeric_value else 0.0,
+                    concept_as_value=(
+                        replace_escape_chars(text_value) if text_value else "0"
+                    ),
+                    is_numeric_type=is_numeric_type,
+                )
+                existing_outpatient_events.append(
+                    (
+                        date,
+                        code,
+                        numeric_value,
+                        text_value,
+                        concept_value_mask,
+                        numeric_value,
+                    )
+                )
+            # For inpatient or ER visits, we want to discharge_facility to the end of the visit
+            if is_er_or_inpatient:
+                # If visit_end_datetime is populated for the inpatient visit, we update the date_cursor
+                visit_end_datetime = visit.get("visit_end_datetime", None)
+                if visit_end_datetime:
+                    date_cursor = visit_end_datetime
+                if self._include_auxiliary_token:
+                    # Reuse the age and date calculated for the last event in the patient timeline for the discharge
+                    # facility event
+                    discharge_facility = (
+                        visit["discharge_facility"]
+                        if ("discharge_facility" in visit)
+                        and visit["discharge_facility"]
+                        else "0"
+                    )
+                    self._update_cehrgpt_record(
+                        cehrgpt_record,
+                        code=discharge_facility,
+                    )
+            # Reuse the age and date calculated for the last event in the patient timeline
+            self._update_cehrgpt_record(
+                cehrgpt_record,
+                code="[VE]",
+            )
+        # Generate the orders of the concepts that the cehrbert dataset mapping function expects
+        cehrgpt_record["orders"] = list(
+            range(1, len(cehrgpt_record["concept_ids"]) + 1)
+        )
+        # Add some count information for this sequence
+        cehrgpt_record["num_of_concepts"] = len(cehrgpt_record["concept_ids"])
+        cehrgpt_record["num_of_visits"] = len(record["visits"])
+        if "label" in record:
+            cehrgpt_record["label"] = record["label"]
+        if "age_at_index" in record:
+            cehrgpt_record["age_at_index"] = record["age_at_index"]
+        return cehrgpt_record
 class HFCehrGptTokenizationMapping(DatasetMapping):
     def __init__(
         self,

cehrgpt/data/hf_cehrgpt_dpo_collator.py ADDED Viewed

@@ -0,0 +1,71 @@
+import torch
+from torch.nn.utils.rnn import pad_sequence
+from cehrgpt.data.hf_cehrgpt_dataset_collator import CehrGptDataCollator
+class CehrGptDPODataCollator(CehrGptDataCollator):
+    def create_preference_inputs(self, examples, prefix):
+        batch = {}
+        # Assume that each example in the batch is a dictionary with 'input_ids' and 'attention_mask'
+        batch_input_ids = [
+            self._try_reverse_tensor(
+                self._convert_to_tensor(example[f"{prefix}_input_ids"])
+            )
+            for example in examples
+        ]
+        batch_attention_mask = [
+            self._try_reverse_tensor(
+                torch.ones_like(
+                    self._convert_to_tensor(example[f"{prefix}_input_ids"]),
+                    dtype=torch.float,
+                )
+            )
+            for example in examples
+        ]
+        # Pad sequences to the max length in the batch
+        batch[f"{prefix}_input_ids"] = self._try_reverse_tensor(
+            pad_sequence(
+                batch_input_ids,
+                batch_first=True,
+                padding_value=self.tokenizer.pad_token_id,
+            ).to(torch.int64)
+        )
+        batch[f"{prefix}_attention_mask"] = self._try_reverse_tensor(
+            pad_sequence(batch_attention_mask, batch_first=True, padding_value=0.0)
+        )
+        assert batch[f"{prefix}_input_ids"].shape[1] <= self.max_length
+        assert batch[f"{prefix}_attention_mask"].shape[1] <= self.max_length
+        if self.include_values:
+            batch_value_indicators = [
+                self._try_reverse_tensor(
+                    self._convert_to_tensor(example[f"{prefix}_value_indicators"])
+                )
+                for example in examples
+            ]
+            batch_values = [
+                self._try_reverse_tensor(
+                    self._convert_to_tensor(example[f"{prefix}__values"])
+                )
+                for example in examples
+            ]
+            batch[f"{prefix}_value_indicators"] = self._try_reverse_tensor(
+                pad_sequence(
+                    batch_value_indicators, batch_first=True, padding_value=False
+                )
+            )
+            batch[f"{prefix}_values"] = self._try_reverse_tensor(
+                pad_sequence(batch_values, batch_first=True, padding_value=-1.0)
+            )
+            assert batch[f"{prefix}_value_indicators"].shape[1] <= self.max_length
+            assert batch[f"{prefix}_values"].shape[1] <= self.max_length
+        return batch
+    def __call__(self, examples):
+        batch_chosen = self.create_preference_inputs(examples, "chosen")
+        batch_rejected = self.create_preference_inputs(examples, "rejected")
+        batch_chosen.update(batch_rejected)
+        return batch_chosen

cehrgpt/data/hf_cehrgpt_dpo_dataset_mapping.py ADDED Viewed

@@ -0,0 +1,61 @@
+import copy
+from typing import Any, Dict
+import numpy as np
+from cehrbert.data_generators.hf_data_generator.hf_dataset_mapping import DatasetMapping
+from cehrgpt.models.tokenization_hf_cehrgpt import CehrGptTokenizer
+class HFCehrGptDPOTokenizationMapping(DatasetMapping):
+    def __init__(
+        self,
+        concept_tokenizer: CehrGptTokenizer,
+    ):
+        self._concept_tokenizer = concept_tokenizer
+        self._lab_token_ids = self._concept_tokenizer.lab_token_ids
+    def transform_with_prefix(self, record: Dict[str, Any], prefix) -> Dict[str, Any]:
+        concept_ids = record[f"{prefix}_concept_ids"]
+        input_ids = self._concept_tokenizer.encode(concept_ids)
+        record[f"{prefix}_input_ids"] = input_ids
+        if f"{prefix}_concept_value_masks" in record:
+            concept_value_masks = record[f"{prefix}_concept_value_masks"]
+            concept_values = record[f"{prefix}_concept_values"]
+            # If any concept has a value associated with it, we normalize the value
+            if np.any(np.asarray(concept_value_masks) > 0):
+                units = record[f"{prefix}_units"]
+                normalized_concept_values = copy.deepcopy(concept_values)
+                for i, (
+                    concept_id,
+                    unit,
+                    token_id,
+                    concept_value_mask,
+                    concept_value,
+                ) in enumerate(
+                    zip(
+                        concept_ids,
+                        units,
+                        input_ids,
+                        concept_value_masks,
+                        concept_values,
+                    )
+                ):
+                    if token_id in self._lab_token_ids:
+                        normalized_concept_value = self._concept_tokenizer.normalize(
+                            concept_id, unit, concept_value
+                        )
+                        normalized_concept_values[i] = normalized_concept_value
+                record[f"{prefix}_concept_values"] = normalized_concept_values
+                # Overwrite the column names
+                record[f"{prefix}_value_indicators"] = record[
+                    f"{prefix}_concept_value_masks"
+                ]
+                record[f"{prefix}_values"] = record[f"{prefix}_concept_values"]
+        return record
+    def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
+        record = self.transform_with_prefix(record, prefix="chosen")
+        record.update(self.transform_with_prefix(record, prefix="rejected"))
+        return record

cehrgpt/generation/generate_paired_cehrgpt_sequence.py ADDED Viewed

@@ -0,0 +1,224 @@
+import datetime
+import os
+import random
+import uuid
+import pandas as pd
+import torch
+from cehrbert.runners.runner_util import load_parquet_as_dataset
+from transformers.utils import is_flash_attn_2_available, logging
+from cehrgpt.cehrgpt_args import create_inference_base_arg_parser
+from cehrgpt.generation.generate_batch_hf_gpt_sequence import (
+    generate_single_batch,
+    normalize_value,
+)
+from cehrgpt.gpt_utils import get_cehrgpt_output_folder
+from cehrgpt.models.hf_cehrgpt import CEHRGPT2LMHeadModel
+from cehrgpt.models.tokenization_hf_cehrgpt import CehrGptTokenizer
+LOG = logging.get_logger("transformers")
+def main(args):
+    if torch.cuda.is_available():
+        device = torch.device("cuda")
+    else:
+        device = torch.device("cpu")
+    cehrgpt_tokenizer = CehrGptTokenizer.from_pretrained(args.tokenizer_folder)
+    cehrgpt_model = (
+        CEHRGPT2LMHeadModel.from_pretrained(
+            args.model_folder,
+            attn_implementation=(
+                "flash_attention_2" if is_flash_attn_2_available() else "eager"
+            ),
+            torch_dtype=(
+                torch.bfloat16 if is_flash_attn_2_available() else torch.float32
+            ),
+        )
+        .eval()
+        .to(device)
+    )
+    cehrgpt_model.generation_config.pad_token_id = cehrgpt_tokenizer.pad_token_id
+    cehrgpt_model.generation_config.eos_token_id = cehrgpt_tokenizer.end_token_id
+    cehrgpt_model.generation_config.bos_token_id = cehrgpt_tokenizer.end_token_id
+    folder_name = get_cehrgpt_output_folder(args, cehrgpt_tokenizer)
+    output_folder_name = os.path.join(
+        args.output_folder, folder_name, "generated_sequences"
+    )
+    if not os.path.exists(output_folder_name):
+        os.makedirs(output_folder_name)
+    LOG.info(f"Loading tokenizer at {args.model_folder}")
+    LOG.info(f"Loading model at {args.model_folder}")
+    LOG.info(f"Write sequences to {output_folder_name}")
+    LOG.info(f"Context window {args.context_window}")
+    LOG.info(f"Temperature {args.temperature}")
+    LOG.info(f"Repetition Penalty {args.repetition_penalty}")
+    LOG.info(f"Sampling Strategy {args.sampling_strategy}")
+    LOG.info(f"Num beam {args.num_beams}")
+    LOG.info(f"Num beam groups {args.num_beam_groups}")
+    LOG.info(f"Epsilon cutoff {args.epsilon_cutoff}")
+    LOG.info(f"Top P {args.top_p}")
+    LOG.info(f"Top K {args.top_k}")
+    LOG.info(f"Loading sequence_data_path at {args.sequence_data_path}")
+    dataset = load_parquet_as_dataset(args.sequence_data_path)
+    total_rows = len(dataset)
+    float(args.batch_size) / total_rows
+    num_of_batches = args.num_of_patients // args.batch_size + 1
+    sequence_to_flush = []
+    for i in range(num_of_batches):
+        LOG.info(f"{datetime.datetime.now()}: Batch {i} started")
+        sample_data = []
+        while len(sample_data) == 0:
+            random_indices = random.sample(range(total_rows), k=1)
+            for row in dataset.select(random_indices):
+                if 4 <= len(row["concept_ids"]) <= cehrgpt_model.config.n_positions:
+                    sample_data.append(row)
+        prompts = []
+        chosen_responses = []
+        cutoff_frac = random.uniform(0, args.cutoff_frac_max)
+        for row in sample_data:
+            seq_len = len(row["concept_ids"])
+            prompt_len = max(4, int(seq_len * cutoff_frac))
+            prompts.append(cehrgpt_tokenizer.encode(row["concept_ids"][:prompt_len]))
+            chosen_responses.append(
+                {
+                    "person_id": row["person_id"],
+                    "chosen_concept_ids": (
+                        row["concept_ids"] if "concept_ids" in row else None
+                    ),
+                    "chosen_concept_values": (
+                        row["concept_values"] if "concept_values" in row else None
+                    ),
+                    "chosen_concept_value_masks": (
+                        row["concept_value_masks"]
+                        if "concept_value_masks" in row
+                        else None
+                    ),
+                    "chosen_units": row["units"] if "units" in row else None,
+                    "prompt_length": prompt_len,
+                }
+            )
+        batch_sequences = generate_single_batch(
+            cehrgpt_model,
+            cehrgpt_tokenizer,
+            prompts=prompts,
+            max_new_tokens=args.context_window,
+            mini_num_of_concepts=args.min_num_of_concepts,
+            top_p=args.top_p,
+            top_k=args.top_k,
+            temperature=args.temperature,
+            repetition_penalty=args.repetition_penalty,
+            num_beams=args.num_beams,
+            num_beam_groups=args.num_beam_groups,
+            epsilon_cutoff=args.epsilon_cutoff,
+            device=device,
+        )
+        # Clear the cache
+        torch.cuda.empty_cache()
+        for seq, value_indicator, value, chosen_response in zip(
+            batch_sequences["sequences"],
+            batch_sequences["value_indicators"],
+            batch_sequences["values"],
+            chosen_responses,
+        ):
+            output = {"rejected_concept_ids": seq}
+            normalized_values, units = normalize_value(
+                seq, value_indicator, value, cehrgpt_tokenizer
+            )
+            if normalized_values is not None:
+                output["rejected_concept_values"] = normalized_values
+            if value_indicator is not None:
+                output["rejected_concept_value_masks"] = value_indicator
+            if units is not None:
+                output["rejected_units"] = units
+            output.update(chosen_response)
+            sequence_to_flush.append(output)
+        if len(sequence_to_flush) >= args.buffer_size:
+            LOG.info(f"{datetime.datetime.now()}: Flushing to the Disk at Batch {i}")
+            pd.DataFrame(
+                sequence_to_flush,
+                columns=[
+                    "person_id",
+                    "chosen_concept_ids",
+                    "chosen_concept_values",
+                    "chosen_concept_value_masks",
+                    "chosen_units",
+                    "prompt_length",
+                    "rejected_concept_ids",
+                    "rejected_concept_values",
+                    "rejected_concept_value_masks",
+                    "rejected_units",
+                ],
+            ).to_parquet(os.path.join(output_folder_name, f"{uuid.uuid4()}.parquet"))
+            sequence_to_flush.clear()
+    if len(sequence_to_flush) > 0:
+        LOG.info(f"{datetime.datetime.now()}: Flushing to the Disk at Final Batch")
+        pd.DataFrame(
+            sequence_to_flush,
+            columns=[
+                "person_id",
+                "chosen_concept_ids",
+                "chosen_concept_values",
+                "chosen_concept_value_masks",
+                "chosen_units",
+                "prompt_length",
+                "rejected_concept_ids",
+                "rejected_concept_values",
+                "rejected_concept_value_masks",
+                "rejected_units",
+            ],
+        ).to_parquet(os.path.join(output_folder_name, f"{uuid.uuid4()}-last.parquet"))
+def create_arg_parser():
+    base_arg_parser = create_inference_base_arg_parser(
+        description="Arguments for generating paired patient sequences"
+    )
+    base_arg_parser.add_argument(
+        "--num_of_patients",
+        dest="num_of_patients",
+        action="store",
+        type=int,
+        help="The number of patients that will be generated",
+        required=True,
+    )
+    base_arg_parser.add_argument(
+        "--sequence_data_path",
+        dest="sequence_data_path",
+        action="store",
+        help="The path for your sequence data",
+        required=True,
+    )
+    base_arg_parser.add_argument(
+        "--cutoff_frac_max",
+        dest="cutoff_frac_max",
+        action="store",
+        type=float,
+        help="The max fraction of the patient sequences that will be used for prompting",
+        required=False,
+        default=0.5,
+    )
+    base_arg_parser.add_argument(
+        "--num_proc",
+        dest="num_proc",
+        action="store",
+        type=int,
+        required=False,
+        default=1,
+    )
+    return base_arg_parser
+# Script entry point: parse the command line arguments and generate the paired sequences
+if __name__ == "__main__":
+    main(create_arg_parser().parse_args())

cehrgpt/generation/omop_converter_batch.py CHANGED Viewed

@@ -35,6 +35,7 @@ from cehrgpt.models.tokenization_hf_cehrgpt import END_TOKEN
 # TODO: move these to cehrbert_data
 STOP_TOKENS = ["VE", "[VE]", END_TOKEN]
+OOV = "[OOV]"
 CURRENT_PATH = Path(__file__).parent
 START_TOKEN_SIZE = 4
 ATT_TIME_TOKENS = generate_artificial_time_tokens()
@@ -297,6 +298,8 @@ def gpt_to_omop_converter_batch(
         inpatient_visit_indicator = False
         for event_idx, event in enumerate(clinical_events, 0):
+            if event == OOV:
+                continue
             # For bad sequences, we don't proceed further and break from the for loop
             if bad_sequence:
                 break

cehrgpt/models/hf_cehrgpt.py CHANGED Viewed

@@ -1766,6 +1766,7 @@ class CehrGptForClassification(CEHRGPTPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> CehrGptSequenceClassifierOutput:
         cehrgpt_output = self.cehrgpt(
             input_ids=input_ids,

cehrgpt/models/tokenization_hf_cehrgpt.py CHANGED Viewed

@@ -918,12 +918,12 @@ class CehrGptTokenizer(PreTrainedTokenizer):
         map_statistics_partial = partial(map_statistics, size=SAMPLE_SIZE)
         if data_args.streaming:
+            first_example = next(iter(dataset))
             parts = dataset.map(
                 partial(agg_helper, map_func=map_statistics_partial),
                 batched=True,
                 batch_size=data_args.preprocessing_batch_size,
-                new_fingerprint="invalid",
-                remove_columns=dataset.column_names,
+                remove_columns=first_example.keys(),
             )
         else:
             parts = dataset.map(

cehrgpt/rl_finetune/__init__.py ADDED Viewed

File without changes

cehrgpt 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl

cehrgpt 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl