PyPI - cehrgpt - Versions diffs - 0.0.2__py3-none-any.whl → 0.1.1__py3-none-any.whl - Mend

cehrgpt 0.0.2__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

cehrgpt/analysis/irregularity.py +36 -0
cehrgpt/data/hf_cehrgpt_dataset.py +25 -4
cehrgpt/data/hf_cehrgpt_dataset_collator.py +635 -97
cehrgpt/data/hf_cehrgpt_dataset_mapping.py +308 -95
cehrgpt/data/sample_packing_sampler.py +181 -0
cehrgpt/generation/generate_batch_hf_gpt_sequence.py +12 -9
cehrgpt/generation/omop_converter_batch.py +32 -2
cehrgpt/gpt_utils.py +20 -2
cehrgpt/models/config.py +35 -0
cehrgpt/models/hf_cehrgpt.py +470 -106
cehrgpt/models/hf_modeling_outputs.py +1 -0
cehrgpt/models/special_tokens.py +1 -0
cehrgpt/models/tokenization_hf_cehrgpt.py +358 -71
cehrgpt/runners/data_utils.py +358 -0
cehrgpt/runners/gpt_runner_util.py +0 -10
cehrgpt/runners/hf_cehrgpt_finetune_runner.py +181 -283
cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +288 -112
cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +90 -0
cehrgpt/runners/hyperparameter_search_util.py +10 -8
cehrgpt/runners/sample_packing_trainer.py +185 -0
cehrgpt/simulations/generate_plots.py +95 -0
cehrgpt/simulations/run_simulation.sh +24 -0
cehrgpt/simulations/time_embedding_simulation.py +250 -0
cehrgpt/simulations/time_token_simulation.py +177 -0
cehrgpt/time_to_event/config/1_year_cabg.yaml +23 -0
cehrgpt/time_to_event/time_to_event_model.py +2 -13
cehrgpt/time_to_event/time_to_event_prediction.py +27 -13
cehrgpt/tools/linear_prob/__init__.py +0 -0
cehrgpt/tools/linear_prob/compute_cehrgpt_features.py +495 -0
cehrgpt/tools/linear_prob/train_with_cehrgpt_features.py +152 -0
{cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/METADATA +11 -8
{cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/RECORD +36 -32
{cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/WHEEL +1 -1
cehrgpt/data/hf_cehrgpt_dpo_collator.py +0 -71
cehrgpt/data/hf_cehrgpt_dpo_dataset_mapping.py +0 -61
cehrgpt/generation/generate_paired_cehrgpt_sequence.py +0 -224
cehrgpt/rl_finetune/cehrgpt_dpo_trainer.py +0 -586
cehrgpt/rl_finetune/cehrgpt_ppo_trainer.py +0 -464
cehrgpt/rl_finetune/ppo_finetune.py +0 -394
cehrgpt/rl_finetune/ppo_finetune_v2.py +0 -373
cehrgpt/runners/hf_cehrgpt_dpo_runner.py +0 -119
/cehrgpt/{rl_finetune → simulations}/__init__.py +0 -0
{cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info/licenses}/LICENSE +0 -0
{cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/top_level.txt +0 -0

cehrgpt/data/hf_cehrgpt_dataset_mapping.py CHANGED Viewed

@@ -1,19 +1,33 @@
 import datetime
-from typing import Any, Dict
+from collections import defaultdict
+from typing import Any, Dict, Generator, List, Optional, Union
 import numpy as np
 import pandas as pd
+from cehrbert.data_generators.hf_data_generator import UNKNOWN_VALUE
 from cehrbert.data_generators.hf_data_generator.hf_dataset_mapping import (
     ED_VISIT_TYPE_CODES,
     INPATIENT_VISIT_TYPE_CODES,
     INPATIENT_VISIT_TYPES,
     DatasetMapping,
+    VisitObject,
+    get_value,
+    has_events_and_get_events,
     replace_escape_chars,
 )
+from cehrbert.med_extension.schema_extension import Event
 from cehrbert.runners.hf_runner_argument_dataclass import DataTrainingArguments
+from cehrbert_data.const.artificial_tokens import (
+    DISCHARGE_UNKNOWN_TOKEN,
+    GENDER_UNKNOWN_TOKEN,
+    RACE_UNKNOWN_TOKEN,
+    VISIT_UNKNOWN_TOKEN,
+)
 from cehrbert_data.const.common import NA
 from cehrbert_data.decorators.patient_event_decorator_base import get_att_function
+from datasets.formatting.formatting import LazyBatch
 from dateutil.relativedelta import relativedelta
+from pandas import Series
 from cehrgpt.models.tokenization_hf_cehrgpt import (
     NONE_BIN,
@@ -21,6 +35,17 @@ from cehrgpt.models.tokenization_hf_cehrgpt import (
     CehrGptTokenizer,
 )
+CEHRGPT_COLUMNS = [
+    "concept_ids",
+    "concept_value_masks",
+    "number_as_values",
+    "concept_as_values",
+    "is_numeric_types",
+    "concept_values",
+    "units",
+    "epoch_times",
+]
 def convert_date_to_posix_time(index_date: datetime.date) -> float:
     return datetime.datetime.combine(
@@ -28,11 +53,39 @@ def convert_date_to_posix_time(index_date: datetime.date) -> float:
     ).timestamp()
-class MedToCehrGPTDatasetMapping(DatasetMapping):
+class DatasetMappingDecorator(DatasetMapping):
+    def batch_transform(
+        self, records: Union[LazyBatch, Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
+        """
+        Drop index_date if it contains None.
+        :param records:
+        :return:
+        """
+        if isinstance(records, LazyBatch):
+            table = records.pa_table
+            if "index_date" in table.column_names:
+                index_col = table.column("index_date")
+                if index_col.null_count > 0:
+                    table = table.drop(["index_date"])
+            records = LazyBatch(pa_table=table, formatter=records.formatter)
+        else:
+            if "index_date" in records:
+                if pd.isna(records["index_date"][0]):
+                    del records["index_date"]
+        return super().batch_transform(records=records)
+    def transform(self, record: Dict[str, Any]) -> Union[Dict[str, Any], Series]:
+        raise NotImplemented("Must be implemented")
+class MedToCehrGPTDatasetMapping(DatasetMappingDecorator):
     def __init__(
         self,
         data_args: DataTrainingArguments,
-        is_pretraining: bool = True,
         include_inpatient_hour_token: bool = True,
     ):
         self._time_token_function = get_att_function(data_args.att_function_type)
@@ -41,7 +94,6 @@ class MedToCehrGPTDatasetMapping(DatasetMapping):
             data_args.inpatient_att_function_type
         )
         self._include_demographic_prompt = data_args.include_demographic_prompt
-        self._is_pretraining = is_pretraining
         self._include_inpatient_hour_token = include_inpatient_hour_token
     """
@@ -57,19 +109,13 @@ class MedToCehrGPTDatasetMapping(DatasetMapping):
     """
     def remove_columns(self):
-        if self._is_pretraining:
-            return ["visits", "birth_datetime", "index_date"]
-        else:
-            return [
-                "visits",
-                "birth_datetime",
-                "visit_concept_ids",
-            ]
+        return ["patient_id", "visits", "birth_datetime"]
     @staticmethod
     def _update_cehrgpt_record(
         cehrgpt_record: Dict[str, Any],
         code: str,
+        time: datetime.datetime,
         concept_value_mask: int = 0,
         number_as_value: float = 0.0,
         concept_as_value: str = "0",
@@ -82,6 +128,7 @@ class MedToCehrGPTDatasetMapping(DatasetMapping):
         cehrgpt_record["concept_as_values"].append(concept_as_value)
         cehrgpt_record["units"].append(unit)
         cehrgpt_record["is_numeric_types"].append(is_numeric_type)
+        cehrgpt_record["epoch_times"].append(time.timestamp())
     def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
         cehrgpt_record = {
@@ -92,45 +139,57 @@ class MedToCehrGPTDatasetMapping(DatasetMapping):
             "concept_as_values": [],
             "units": [],
             "is_numeric_types": [],
+            "epoch_times": [],
         }
         # Extract the demographic information
         birth_datetime = record["birth_datetime"]
         if isinstance(birth_datetime, pd.Timestamp):
             birth_datetime = birth_datetime.to_pydatetime()
         gender = record["gender"]
+        gender = GENDER_UNKNOWN_TOKEN if gender == UNKNOWN_VALUE else gender
         race = record["race"]
+        race = RACE_UNKNOWN_TOKEN if race == UNKNOWN_VALUE else race
+        visits = record["visits"]
+        # This indicates this is columnar format
+        if isinstance(visits, dict):
+            visits = sorted(self.convert_visit_columnar_to_python(visits))
+        else:
+            visits = sorted(visits, key=lambda _: get_value(_, "visit_start_datetime"))
         # Add the demographic tokens
-        first_visit = record["visits"][0]
-        year_str = f'year:{str(first_visit["visit_start_datetime"].year)}'
-        age_str = f'age:{str(relativedelta(first_visit["visit_start_datetime"], birth_datetime).years)}'
-        self._update_cehrgpt_record(cehrgpt_record, year_str)
-        self._update_cehrgpt_record(cehrgpt_record, age_str)
-        self._update_cehrgpt_record(cehrgpt_record, gender)
-        self._update_cehrgpt_record(cehrgpt_record, race)
+        first_visit = visits[0]
+        first_visit_start_datetime: datetime.datetime = get_value(
+            first_visit, "visit_start_datetime"
+        )
+        year_str = f"year:{str(first_visit_start_datetime.year)}"
+        age_str = f"age:{str(relativedelta(first_visit_start_datetime, birth_datetime).years)}"
+        self._update_cehrgpt_record(
+            cehrgpt_record, year_str, first_visit_start_datetime
+        )
+        self._update_cehrgpt_record(cehrgpt_record, age_str, first_visit_start_datetime)
+        self._update_cehrgpt_record(cehrgpt_record, gender, first_visit_start_datetime)
+        self._update_cehrgpt_record(cehrgpt_record, race, first_visit_start_datetime)
         # Use a data cursor to keep track of time
-        date_cursor = None
-        # Loop through all the visits excluding the first event containing the demographics
-        for i, visit in enumerate(
-            sorted(record["visits"], key=lambda e: e["visit_start_datetime"])
-        ):
-            events = visit["events"]
-            # Skip this visit if the number measurements in the event is zero
-            if events is None or len(events) == 0:
+        datetime_cursor: Optional[datetime.datetime] = None
+        visit: VisitObject
+        # Loop through all the visits
+        for i, visit in enumerate(visits):
+            events: Generator[Event, None, None] = get_value(visit, "events")
+            has_events, events = has_events_and_get_events(events)
+            if not has_events:
                 continue
-            visit_start_datetime = visit["visit_start_datetime"]
-            time_delta = (
-                (visit_start_datetime - date_cursor).days if date_cursor else None
+            visit_start_datetime: datetime.datetime = get_value(
+                visit, "visit_start_datetime"
+            )
+            # If visit_end_datetime is populated for the inpatient visit, we update the datetime_cursor
+            visit_end_datetime: Optional[datetime.datetime] = get_value(
+                visit, "visit_end_datetime"
             )
-            date_cursor = visit_start_datetime
             # We assume the first measurement to be the visit type of the current visit
-            visit_type = visit["visit_type"]
+            visit_type = get_value(visit, "visit_type")
             is_er_or_inpatient = (
                 visit_type in INPATIENT_VISIT_TYPES
                 or visit_type in INPATIENT_VISIT_TYPE_CODES
@@ -138,36 +197,45 @@ class MedToCehrGPTDatasetMapping(DatasetMapping):
             )
             # Add artificial time tokens to the patient timeline if timedelta exists
-            if time_delta is not None:
+            if datetime_cursor is not None:
+                time_delta = max((visit_start_datetime - datetime_cursor).days, 0)
                 # This generates an artificial time token depending on the choice of the time token functions
                 self._update_cehrgpt_record(
                     cehrgpt_record,
                     code=self._time_token_function(time_delta),
+                    time=visit_start_datetime,
                 )
-            # Add the VS token to the patient timeline to mark the start of a visit
-            relativedelta(visit["visit_start_datetime"], birth_datetime).years
-            # Calculate the week number since the epoch time
-            date = (
-                visit["visit_start_datetime"]
-                - datetime.datetime(year=1970, month=1, day=1)
-            ).days // 7
+            datetime_cursor = visit_start_datetime
             # Add a [VS] token
             self._update_cehrgpt_record(
                 cehrgpt_record,
                 code="[VS]",
+                time=datetime_cursor,
             )
             # Add a visit type token
             self._update_cehrgpt_record(
                 cehrgpt_record,
                 code=visit_type,
+                time=datetime_cursor,
             )
+            # We need to insert an inpatient hour token right after the visit type, we calculate the hour interval
+            # with respect to the midnight of the day
+            if is_er_or_inpatient and self._include_inpatient_hour_token:
+                if datetime_cursor.hour > 0:
+                    # This generates an artificial time token depending on the choice of the time token functions
+                    self._update_cehrgpt_record(
+                        cehrgpt_record,
+                        code=f"i-H{datetime_cursor.hour}",
+                        time=datetime_cursor,
+                    )
             # Keep track of the existing outpatient events, we don't want to add them again
-            existing_outpatient_events = list()
+            existing_duplicate_events = list()
             for e in events:
                 # If the event doesn't have a time stamp, we skip it
-                if not e["time"]:
+                event_time: datetime.datetime = e["time"]
+                if not event_time:
                     continue
                 # If numeric_value exists, this is a concept/value tuple, we indicate this using a concept_value_mask
@@ -178,47 +246,62 @@ class MedToCehrGPTDatasetMapping(DatasetMapping):
                 concept_value_mask = int(
                     numeric_value is not None or text_value is not None
                 )
+                if numeric_value is None and text_value is not None:
+                    if text_value.isnumeric():
+                        numeric_value = float(text_value)
                 is_numeric_type = int(numeric_value is not None)
                 code = replace_escape_chars(e["code"])
+                # Create the event identity
+                event_identity = (
+                    (event_time, code, text_value, unit)
+                    if is_er_or_inpatient
+                    else (event_time.date(), code, text_value, unit)
+                )
                 # Add a medical token to the patient timeline
                 # If this is an inpatient visit, we use the event time stamps to calculate age and date
                 # because the patient can stay in the hospital for a period of time.
                 if is_er_or_inpatient:
-                    # Calculate the week number since the epoch time
-                    date = (
-                        e["time"] - datetime.datetime(year=1970, month=1, day=1)
-                    ).days // 7
                     # Calculate the time diff in days w.r.t the previous measurement
-                    meas_time_diff = (e["time"] - date_cursor).days
-                    # Update the date_cursor if the time diff between two neighboring measurements is greater than and
+                    time_diff_days = (event_time - datetime_cursor).days
+                    # Update the datetime_cursor if the time diff between two neighboring measurements is greater than and
                     # equal to 1 day
-                    if meas_time_diff > 0:
-                        date_cursor = e["time"]
-                        if self._inpatient_time_token_function:
+                    if self._inpatient_time_token_function and time_diff_days > 0:
+                        # This generates an artificial time token depending on the choice of the time token functions
+                        self._update_cehrgpt_record(
+                            cehrgpt_record,
+                            code=f"i-{self._inpatient_time_token_function(time_diff_days)}",
+                            time=event_time,
+                        )
+                    if self._include_inpatient_hour_token:
+                        # if the time difference in days is greater than 0, we calculate the hour interval
+                        # with respect to the midnight of the day
+                        time_diff_hours = (
+                            event_time.hour
+                            if time_diff_days > 0
+                            else int(
+                                (event_time - datetime_cursor).total_seconds() // 3600
+                            )
+                        )
+                        if time_diff_hours > 0:
                             # This generates an artificial time token depending on the choice of the time token functions
                             self._update_cehrgpt_record(
                                 cehrgpt_record,
-                                code=f"i-{self._inpatient_time_token_function(meas_time_diff)}",
+                                code=f"i-H{time_diff_hours}",
+                                time=event_time,
                             )
-                else:
-                    # For outpatient visits, we use the visit time stamp to calculate age and time because we assume
-                    # the outpatient visits start and end on the same day.
-                    # We check whether the date/code/value combination already exists in the existing events
-                    # If they exist, we do not add them to the patient timeline for outpatient visits.
-                    if (
-                        date,
-                        code,
-                        numeric_value,
-                        text_value,
-                        concept_value_mask,
-                        numeric_value,
-                    ) in existing_outpatient_events:
-                        continue
+                if event_identity in existing_duplicate_events:
+                    continue
                 self._update_cehrgpt_record(
                     cehrgpt_record,
                     code=code,
+                    time=event_time,
                     concept_value_mask=concept_value_mask,
                     unit=unit,
                     number_as_value=numeric_value if numeric_value else 0.0,
@@ -227,43 +310,44 @@ class MedToCehrGPTDatasetMapping(DatasetMapping):
                     ),
                     is_numeric_type=is_numeric_type,
                 )
-                existing_outpatient_events.append(
-                    (
-                        date,
-                        code,
-                        numeric_value,
-                        text_value,
-                        concept_value_mask,
-                        numeric_value,
-                    )
-                )
+                existing_duplicate_events.append(event_identity)
+                # we only want to update the time stamp when data_cursor is less than the event time
+                if datetime_cursor < event_time or datetime_cursor is None:
+                    datetime_cursor = event_time
+                    # We need to bound the datetime_cursor if the current visit is an admission type of visit
+                    # as the associated events could be generated after the visits are complete
+                    if is_er_or_inpatient and visit_end_datetime is not None:
+                        datetime_cursor = min(datetime_cursor, visit_end_datetime)
             # For inpatient or ER visits, we want to discharge_facility to the end of the visit
             if is_er_or_inpatient:
-                # If visit_end_datetime is populated for the inpatient visit, we update the date_cursor
-                visit_end_datetime = visit.get("visit_end_datetime", None)
-                if visit_end_datetime:
-                    date_cursor = visit_end_datetime
+                # If visit_end_datetime is populated for the inpatient visit, we update the datetime_cursor
+                if visit_end_datetime is not None:
+                    datetime_cursor = visit_end_datetime
                 if self._include_auxiliary_token:
                     # Reuse the age and date calculated for the last event in the patient timeline for the discharge
                     # facility event
-                    discharge_facility = (
-                        visit["discharge_facility"]
-                        if ("discharge_facility" in visit)
-                        and visit["discharge_facility"]
-                        else "0"
-                    )
+                    discharge_facility = get_value(visit, "discharge_facility")
+                    if not discharge_facility:
+                        discharge_facility = DISCHARGE_UNKNOWN_TOKEN
+                    else:
+                        discharge_facility = (
+                            DISCHARGE_UNKNOWN_TOKEN
+                            if discharge_facility == UNKNOWN_VALUE
+                            else discharge_facility
+                        )
                     self._update_cehrgpt_record(
                         cehrgpt_record,
                         code=discharge_facility,
+                        time=datetime_cursor,
                     )
             # Reuse the age and date calculated for the last event in the patient timeline
             self._update_cehrgpt_record(
                 cehrgpt_record,
                 code="[VE]",
+                time=datetime_cursor,
             )
         # Generate the orders of the concepts that the cehrbert dataset mapping function expects
@@ -273,17 +357,23 @@ class MedToCehrGPTDatasetMapping(DatasetMapping):
         # Add some count information for this sequence
         cehrgpt_record["num_of_concepts"] = len(cehrgpt_record["concept_ids"])
-        cehrgpt_record["num_of_visits"] = len(record["visits"])
+        cehrgpt_record["num_of_visits"] = len(visits)
-        if "label" in record:
+        if record.get("index_date", None) is not None:
+            cehrgpt_record["index_date"] = record["index_date"]
+        if record.get("label", None) is not None:
             cehrgpt_record["label"] = record["label"]
-        if "age_at_index" in record:
+        if record.get("age_at_index", None) is not None:
             cehrgpt_record["age_at_index"] = record["age_at_index"]
+        assert len(cehrgpt_record["epoch_times"]) == len(
+            cehrgpt_record["concept_ids"]
+        ), "The number of time stamps must match with the number of concepts in the sequence"
         return cehrgpt_record
-class HFCehrGptTokenizationMapping(DatasetMapping):
+class HFCehrGptTokenizationMapping(DatasetMappingDecorator):
     def __init__(
         self,
         concept_tokenizer: CehrGptTokenizer,
@@ -297,9 +387,46 @@ class HFCehrGptTokenizationMapping(DatasetMapping):
             "is_numeric_types",
         ]
+    def filter_out_invalid_tokens(self, record: Dict[str, Any]) -> Dict[str, Any]:
+        column_names = []
+        seq_length = len(record["concept_ids"])
+        # We can't have "0" as a token in the tokenizer because it would break tokenization for "Race/0", "Visit/0"
+        # This is a pre-caution
+        if "0" in record["concept_ids"]:
+            if isinstance(record["concept_ids"], np.ndarray):
+                record["concept_ids"][record["concept_ids"] == "0"] = "Unknown"
+            else:
+                record["concept_ids"] = [
+                    "Unknown" if x == "0" else x for x in record["concept_ids"]
+                ]
+        for k, v in record.items():
+            if k not in CEHRGPT_COLUMNS:
+                continue
+            if isinstance(v, (list, np.ndarray)) and len(v) == seq_length:
+                column_names.append(k)
+        valid_concept_ids = self._concept_tokenizer.get_vocab().keys()
+        valid_indices = [
+            idx
+            for idx, concept_id in enumerate(record["concept_ids"])
+            if concept_id in valid_concept_ids
+        ]
+        if len(valid_indices) != len(record["concept_ids"]):
+            for column in column_names:
+                values = record[column]
+                record[column] = [values[idx] for idx in valid_indices]
+        return record
     def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
+        # Remove the tokens from patient sequences that do not exist in the tokenizer
+        record = self.filter_out_invalid_tokens(record)
         # If any concept has a value associated with it, we normalize the value
         record["input_ids"] = self._concept_tokenizer.encode(record["concept_ids"])
+        assert len(record["input_ids"]) == len(record["concept_ids"]), (
+            "The number of tokens must equal to the number of concepts\n"
+            f"decoded concept_ids: {self._concept_tokenizer.decode(record['input_ids'], skip_special_tokens=False)}"
+        )
         record["value_indicators"] = record["concept_value_masks"]
         if "number_as_values" not in record or "concept_as_values" not in record:
             record["number_as_values"] = [
@@ -380,3 +507,89 @@ class HFFineTuningMapping(HFCehrGptTokenizationMapping):
         columns = super().remove_columns()
         columns.append("label")
         return columns
+class ExtractTokenizedSequenceDataMapping:
+    def __init__(
+        self,
+        person_index_date_map: Dict[int, List[Dict[str, Any]]],
+        observation_window: int = 0,
+    ):
+        self.person_index_date_map = person_index_date_map
+        self.observation_window = observation_window
+    def _calculate_prediction_start_time(self, prediction_time: float):
+        if self.observation_window and self.observation_window > 0:
+            return max(prediction_time - self.observation_window * 24 * 3600, 0)
+        return 0
+    def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
+        person_id = record["person_id"]
+        prediction_times = self.person_index_date_map[person_id]
+        prediction_start_end_times = [
+            (
+                self._calculate_prediction_start_time(
+                    prediction_time_label_map["index_date"].timestamp()
+                ),
+                prediction_time_label_map["index_date"].timestamp(),
+                prediction_time_label_map["label"],
+            )
+            for prediction_time_label_map in prediction_times
+        ]
+        observation_window_indices = np.zeros(
+            (len(prediction_times), len(record["epoch_times"])), dtype=bool
+        )
+        for i, epoch_time in enumerate(record["epoch_times"]):
+            for sample_n, (
+                feature_extraction_time_start,
+                feature_extraction_end_end,
+                _,
+            ) in enumerate(prediction_start_end_times):
+                if (
+                    feature_extraction_time_start
+                    <= epoch_time
+                    <= feature_extraction_end_end
+                ):
+                    observation_window_indices[sample_n][i] = True
+        seq_length = len(record["epoch_times"])
+        time_series_columns = ["concept_ids", "input_ids"]
+        static_inputs = dict()
+        for k, v in record.items():
+            if k in ["concept_ids", "input_ids"]:
+                continue
+            if isinstance(v, (list, np.ndarray)) and len(v) == seq_length:
+                time_series_columns.append(k)
+            else:
+                static_inputs[k] = v
+        batched_samples = defaultdict(list)
+        for (_, index_date, label), observation_window_index in zip(
+            prediction_start_end_times, observation_window_indices
+        ):
+            for k, v in static_inputs.items():
+                batched_samples[k].append(v)
+            batched_samples["classifier_label"].append(label)
+            batched_samples["index_date"].append(index_date)
+            try:
+                start_age = int(record["concept_ids"][1].split(":")[1])
+            except Exception:
+                start_age = -1
+            batched_samples["age_at_index"].append(start_age)
+            for time_series_column in time_series_columns:
+                batched_samples[time_series_column].append(
+                    np.asarray(record[time_series_column])[observation_window_index]
+                )
+        return batched_samples
+    def batch_transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
+        all_batched_record = defaultdict(list)
+        all_columns = record.keys()
+        for i in range(len(record["concept_ids"])):
+            one_record = {}
+            for column in all_columns:
+                one_record[column] = record[column][i]
+            new_batched_record = self.transform(one_record)
+            for k, v in new_batched_record.items():
+                all_batched_record[k].extend(v)
+        return all_batched_record

cehrgpt 0.0.2__py3-none-any.whl → 0.1.1__py3-none-any.whl

cehrgpt 0.0.2__py3-none-any.whl → 0.1.1__py3-none-any.whl