cehrgpt 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. cehrgpt/analysis/htn_treatment_pathway.py +546 -0
  2. cehrgpt/analysis/treatment_pathway/__init__.py +0 -0
  3. cehrgpt/analysis/treatment_pathway/depression_treatment_pathway.py +94 -0
  4. cehrgpt/analysis/treatment_pathway/diabetes_treatment_pathway.py +94 -0
  5. cehrgpt/analysis/treatment_pathway/htn_treatment_pathway.py +94 -0
  6. cehrgpt/analysis/treatment_pathway/treatment_pathway.py +631 -0
  7. cehrgpt/data/cehrgpt_data_processor.py +549 -0
  8. cehrgpt/data/hf_cehrgpt_dataset.py +4 -0
  9. cehrgpt/data/hf_cehrgpt_dataset_collator.py +286 -629
  10. cehrgpt/data/hf_cehrgpt_dataset_mapping.py +60 -14
  11. cehrgpt/generation/cehrgpt_conditional_generation.py +316 -0
  12. cehrgpt/generation/generate_batch_hf_gpt_sequence.py +35 -15
  13. cehrgpt/generation/omop_converter_batch.py +11 -4
  14. cehrgpt/gpt_utils.py +73 -3
  15. cehrgpt/models/activations.py +27 -0
  16. cehrgpt/models/config.py +6 -2
  17. cehrgpt/models/gpt2.py +560 -0
  18. cehrgpt/models/hf_cehrgpt.py +193 -459
  19. cehrgpt/models/tokenization_hf_cehrgpt.py +380 -50
  20. cehrgpt/omop/ontology.py +154 -0
  21. cehrgpt/runners/data_utils.py +17 -6
  22. cehrgpt/runners/hf_cehrgpt_finetune_runner.py +33 -79
  23. cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +48 -44
  24. cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +58 -34
  25. cehrgpt/runners/hyperparameter_search_util.py +180 -69
  26. cehrgpt/runners/sample_packing_trainer.py +11 -2
  27. cehrgpt/tools/linear_prob/compute_cehrgpt_features.py +27 -31
  28. cehrgpt-0.1.3.dist-info/METADATA +238 -0
  29. {cehrgpt-0.1.1.dist-info → cehrgpt-0.1.3.dist-info}/RECORD +33 -22
  30. cehrgpt-0.1.1.dist-info/METADATA +0 -115
  31. /cehrgpt/tools/{merge_synthetic_real_dataasets.py → merge_synthetic_real_datasets.py} +0 -0
  32. {cehrgpt-0.1.1.dist-info → cehrgpt-0.1.3.dist-info}/WHEEL +0 -0
  33. {cehrgpt-0.1.1.dist-info → cehrgpt-0.1.3.dist-info}/licenses/LICENSE +0 -0
  34. {cehrgpt-0.1.1.dist-info → cehrgpt-0.1.3.dist-info}/top_level.txt +0 -0
cehrgpt/runners/hf_gpt_runner_argument_dataclass.py
@@ -1,5 +1,7 @@
  import dataclasses
- from typing import List, Optional
+ from typing import List, Literal, Optional
+
+ from cehrgpt.models.gpt2 import ACT2FN


  @dataclasses.dataclass
@@ -12,6 +14,14 @@ class CehrGPTArguments:
  "help": "The path to the tokenized dataset created for the full population"
  },
  )
+ activation_function: Literal[tuple(ACT2FN.keys())] = dataclasses.field(
+ default="gelu_new",
+ metadata={"help": "The activation function to use"},
+ )
+ decoder_mlp: Literal["GPT2MLP", "LlamaMLP"] = dataclasses.field(
+ default="GPT2MLP",
+ metadata={"help": "The decoder MLP architecture"},
+ )
  include_inpatient_hour_token: Optional[bool] = dataclasses.field(
  default=True,
  metadata={"help": "Include inpatient hour token"},
@@ -54,6 +64,14 @@ class CehrGPTArguments:
  default=128,
  metadata={"help": "The number of examples from the training set."},
  )
+ hyperparameter_tuning: Optional[bool] = dataclasses.field(
+ default=False,
+ metadata={"help": "A flag to indicate if we want to do hyperparameter tuning."},
+ )
+ hyperparameter_tuning_is_grid: Optional[bool] = dataclasses.field(
+ default=True,
+ metadata={"help": "A flag to indicate if we want to do hyperparameter tuning."},
+ )
  hyperparameter_tuning_percentage: Optional[float] = dataclasses.field(
  default=0.1,
  metadata={
@@ -66,10 +84,6 @@ class CehrGPTArguments:
  "help": "The number of trails will be use for hyperparameter tuning."
  },
  )
- hyperparameter_tuning: Optional[bool] = dataclasses.field(
- default=False,
- metadata={"help": "A flag to indicate if we want to do hyperparameter tuning."},
- )
  hyperparameter_batch_sizes: Optional[List[int]] = dataclasses.field(
  default_factory=lambda: [4, 8, 16],
  metadata={"help": "Hyperparameter search batch sizes"},
@@ -78,29 +92,13 @@ class CehrGPTArguments:
  default_factory=lambda: [10],
  metadata={"help": "Hyperparameter search num_train_epochs"},
  )
- lr_low: Optional[float] = dataclasses.field(
- default=1e-5,
- metadata={
- "help": "The lower bound of the learning rate range for hyperparameter tuning."
- },
+ hyperparameter_learning_rates: Optional[List[int]] = dataclasses.field(
+ default_factory=lambda: [1e-5],
+ metadata={"help": "Hyperparameter search learning rates"},
  )
- lr_high: Optional[float] = dataclasses.field(
- default=5e-5,
- metadata={
- "help": "The upper bound of the learning rate range for hyperparameter tuning."
- },
- )
- weight_decays_low: Optional[float] = dataclasses.field(
- default=1e-3,
- metadata={
- "help": "The lower bound of the weight decays range for hyperparameter tuning."
- },
- )
- weight_decays_high: Optional[float] = dataclasses.field(
- default=1e-2,
- metadata={
- "help": "The upper bound of the weight decays range for hyperparameter tuning."
- },
+ hyperparameter_weight_decays: Optional[List[int]] = dataclasses.field(
+ default_factory=lambda: [1e-2],
+ metadata={"help": "Hyperparameter search learning rates"},
  )
  causal_sfm: Optional[bool] = dataclasses.field(
  default=False,
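The scalar `lr_low`/`lr_high` and `weight_decays_low`/`weight_decays_high` bounds are folded into the list-valued `hyperparameter_learning_rates` and `hyperparameter_weight_decays` fields, whose interpretation depends on `hyperparameter_tuning_is_grid`. A hedged sketch of the two configurations, assuming the remaining dataclass fields keep their defaults (the values are examples, not recommendations):

```python
# Illustrative only: the same two list fields drive both search modes.
from cehrgpt.runners.hf_gpt_runner_argument_dataclass import CehrGPTArguments

# Grid search: every listed value is a discrete candidate.
grid_args = CehrGPTArguments(
    hyperparameter_tuning=True,
    hyperparameter_tuning_is_grid=True,
    hyperparameter_learning_rates=[1e-5, 2e-5, 5e-5],
    hyperparameter_weight_decays=[1e-3, 1e-2],
)

# Bayesian (TPE) search: exactly two values act as log-scale lower/upper bounds.
tpe_args = CehrGPTArguments(
    hyperparameter_tuning=True,
    hyperparameter_tuning_is_grid=False,
    hyperparameter_learning_rates=[1e-5, 5e-5],
    hyperparameter_weight_decays=[1e-3, 1e-2],
)
```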
@@ -168,6 +166,16 @@ class CehrGPTArguments:
  "help": "A threshold to denote how much the specified metric must improve to satisfy early stopping conditions."
  },
  )
+ inner_dim: Optional[int] = dataclasses.field(
+ default=None,
+ metadata={"help": "The dimensionality of the hidden layer"},
+ )
+ apply_rotary: Optional[bool] = dataclasses.field(
+ default=False,
+ metadata={
+ "help": "A flag to indicate whether we want to use rotary encoder layers"
+ },
+ )
  sample_packing: Optional[bool] = dataclasses.field(
  default=False,
  metadata={
@@ -177,12 +185,6 @@ class CehrGPTArguments:
  max_tokens_per_batch: int = dataclasses.field(
  default=16384, metadata={"help": "Maximum number of tokens in each batch"}
  )
- add_end_token_in_sample_packing: Optional[bool] = dataclasses.field(
- default=False,
- metadata={
- "help": "A flag to indicate whether we want to add end token in sample packing"
- },
- )
  include_motor_time_to_event: Optional[bool] = dataclasses.field(
  default=False,
  metadata={
@@ -203,7 +205,17 @@ class CehrGPTArguments:
  "help": "The number of times each motor_num_time_pieces piece has to be"
  },
  )
- concept_dir: Optional[str] = dataclasses.field(
+ motor_use_ontology: Optional[bool] = dataclasses.field(
+ default=False,
+ metadata={
+ "help": "A flag to indicate whether we want to use motor_use_ontology"
+ },
+ )
+ motor_sampling_probability: Optional[float] = dataclasses.field(
+ default=0.0,
+ metadata={"help": "A flag to indicate whether we want to use sample packing"},
+ )
+ vocab_dir: Optional[str] = dataclasses.field(
  default=None,
  metadata={"help": "The directory where the concept data is stored."},
  )
@@ -229,3 +241,15 @@ class CehrGPTArguments:
  "help": "The probability of negative samples will be included in the training data"
  },
  )
+ num_of_trajectories_per_sample: Optional[int] = dataclasses.field(
+ default=1,
+ metadata={"help": "The number of trajectories per sample"},
+ )
+ generation_input_length: Optional[int] = dataclasses.field(
+ default=1024,
+ metadata={"help": "The length of the input sequence"},
+ )
+ generation_max_new_tokens: Optional[int] = dataclasses.field(
+ default=1024,
+ metadata={"help": "The maximum number of tokens in the generation sequence"},
+ )
cehrgpt/runners/hyperparameter_search_util.py
@@ -1,5 +1,5 @@
  from functools import partial
- from typing import Callable, Tuple
+ from typing import Callable, List, Optional, Tuple, Union

  import optuna
  from cehrbert.runners.hf_runner_argument_dataclass import ModelArguments
@@ -64,28 +64,99 @@ class OptunaMetricCallback(TrainerCallback):
  metrics.update({"optuna_best_metric": metrics["eval_loss"]})


- # Define the hyperparameter search space with parameters
- def hp_space(
- trial: optuna.Trial,
- lr_range: Tuple[float, float] = (1e-5, 5e-5),
- batch_sizes=None,
- weight_decays: Tuple[float, float] = (1e-4, 1e-2),
- num_train_epochs: Tuple[float, ...] = 10,
- ):
- if batch_sizes is None:
- batch_sizes = [4, 8]
+ def get_suggestion(
+ trial,
+ hyperparameter_name: str,
+ hyperparameters: List[Union[float, int]],
+ is_grid: bool = False,
+ ) -> Union[float, int]:
+ """
+ Get hyperparameter suggestion based on search mode.
+
+ Args:
+ trial: Optuna trial object
+ hyperparameter_name: Name of the hyperparameter
+ hyperparameters: List of hyperparameter values
+ is_grid: Whether to use grid search mode
+
+ Returns:
+ Suggested hyperparameter value
+
+ Raises:
+ RuntimeError: If Bayesian mode is used with incorrect number of bounds
+ """
+ if is_grid:
+ return trial.suggest_categorical(hyperparameter_name, hyperparameters)
+
+ # For Bayesian optimization, we need exactly 2 values (lower and upper bounds)
+ if len(hyperparameters) != 2:
+ raise RuntimeError(
+ f"{hyperparameter_name} must contain exactly two values (lower and upper bound) "
+ f"for Bayesian Optimization, but {len(hyperparameters)} values were provided: {hyperparameters}"
+ )
+
+ # Ensure bounds are sorted
+ lower, upper = sorted(hyperparameters)
+ return trial.suggest_float(hyperparameter_name, lower, upper, log=True)
+
+
+ def hp_space(trial: optuna.Trial, cehrgpt_args: CehrGPTArguments):
+ """
+ Define the hyperparameter search space.
+
+ Args:
+ trial: Optuna trial object
+ cehrgpt_args: CehrGPTArguments
+ Returns:
+ Dictionary of hyperparameter suggestions
+ """
+
+ is_grid = cehrgpt_args.hyperparameter_tuning_is_grid
+ learning_rates = cehrgpt_args.hyperparameter_learning_rates
+ weight_decays = cehrgpt_args.hyperparameter_weight_decays
+ batch_sizes = cehrgpt_args.hyperparameter_batch_sizes
+ num_train_epochs = cehrgpt_args.hyperparameter_num_train_epochs
+
  return {
- "learning_rate": trial.suggest_float("learning_rate", *lr_range, log=True),
+ "learning_rate": get_suggestion(
+ trial, "learning_rate", learning_rates, is_grid
+ ),
  "per_device_train_batch_size": trial.suggest_categorical(
  "per_device_train_batch_size", batch_sizes
  ),
- "weight_decay": trial.suggest_float("weight_decay", *weight_decays, log=True),
+ "weight_decay": get_suggestion(trial, "weight_decay", weight_decays, is_grid),
  "num_train_epochs": trial.suggest_categorical(
  "num_train_epochs", num_train_epochs
  ),
  }


+ def create_grid_search_space(cehrgpt_args: CehrGPTArguments):
+ """
+ Create the search space dictionary for GridSampler.
+
+ Args:
+ cehrgpt_args: CehrGPTArguments
+
+ Returns:
+ Dictionary defining the grid search space
+ """
+ return {
+ "learning_rate": cehrgpt_args.hyperparameter_learning_rates,
+ "weight_decay": cehrgpt_args.hyperparameter_weight_decays,
+ "per_device_train_batch_size": cehrgpt_args.hyperparameter_batch_sizes,
+ "num_train_epochs": cehrgpt_args.hyperparameter_num_train_epochs,
+ }
+
+
+ def calculate_total_combinations(search_space: dict) -> int:
+ """Calculate total number of combinations in grid search."""
+ total = 1
+ for values in search_space.values():
+ total *= len(values)
+ return total
+
+
  def sample_dataset(data: Dataset, percentage: float, seed: int) -> Dataset:
  """
  Samples a subset of the given dataset based on a specified percentage.
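`get_suggestion` is what lets the same list of values serve both modes: in grid mode the list is handed to `suggest_categorical`, and in Bayesian mode exactly two values become sorted, log-scale bounds for `suggest_float`. A minimal sketch of that behaviour against an `optuna.trial.FixedTrial` (the parameter values are illustrative):

```python
# Minimal check of the two suggestion modes; FixedTrial avoids running a study.
import optuna

from cehrgpt.runners.hyperparameter_search_util import (
    calculate_total_combinations,
    get_suggestion,
)

trial = optuna.trial.FixedTrial({"learning_rate": 2e-5, "weight_decay": 5e-3})

# Grid mode: the list is a discrete set of candidates.
lr = get_suggestion(trial, "learning_rate", [1e-5, 2e-5, 5e-5], is_grid=True)

# Bayesian mode: exactly two values become sorted, log-scale bounds.
wd = get_suggestion(trial, "weight_decay", [1e-2, 1e-3], is_grid=False)

assert (lr, wd) == (2e-5, 5e-3)

# The grid bookkeeping used later by perform_hyperparameter_search:
space = {"learning_rate": [1e-5, 2e-5, 5e-5], "weight_decay": [1e-3, 1e-2]}
assert calculate_total_combinations(space) == 6
```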
@@ -113,7 +184,7 @@ def sample_dataset(data: Dataset, percentage: float, seed: int) -> Dataset:
  returns the "test" portion, which is the specified percentage of the dataset.
  - Ensure that `percentage` is between 0 and 1 to avoid errors.
  """
- if percentage == 1.0:
+ if percentage >= 1.0:
  return data

  return data.train_test_split(
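A small sketch of `sample_dataset`'s contract after this change: any percentage at or above 1.0 now short-circuits and returns the dataset unchanged, while a fractional percentage still yields the sampled "test" split described in the docstring (toy data below):

```python
# Toy data; only the short-circuit behaviour is asserted exactly.
from datasets import Dataset

from cehrgpt.runners.hyperparameter_search_util import sample_dataset

ds = Dataset.from_dict({"x": list(range(100))})

assert len(sample_dataset(ds, percentage=1.5, seed=42)) == 100  # returned as-is
print(len(sample_dataset(ds, percentage=0.1, seed=42)))         # ~10 sampled rows
```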
@@ -130,14 +201,13 @@ def perform_hyperparameter_search(
  training_args: TrainingArguments,
  model_args: ModelArguments,
  cehrgpt_args: CehrGPTArguments,
- ) -> TrainingArguments:
+ ) -> Tuple[TrainingArguments, Optional[str]]:
  """
  Perform hyperparameter tuning for the CehrGPT model using Optuna with the Hugging Face Trainer.

- This function initializes a Trainer with sampled training and validation sets, and performs
- a hyperparameter search using Optuna. The search tunes learning rate, batch size, and weight decay
- to optimize model performance based on a specified objective metric (e.g., validation loss).
- After the search, it updates the provided `TrainingArguments` with the best hyperparameters found.
+ This function supports two modes:
+ 1. Bayesian Optimization (TPE): Intelligently explores hyperparameter space using bounds
+ 2. Grid Search: Exhaustively tests all combinations of discrete values

  Args:
  trainer_class: A Trainer or its subclass
@@ -147,15 +217,15 @@ def perform_hyperparameter_search(
  training_args (TrainingArguments): Configuration for training parameters (e.g., epochs, evaluation strategy).
  model_args (ModelArguments): Model configuration arguments, including early stopping parameters.
  cehrgpt_args (CehrGPTArguments): Additional arguments specific to CehrGPT, including hyperparameter
- tuning options such as learning rate range, batch sizes, and tuning percentage.
+ tuning options and search mode configuration.

  Returns:
- TrainingArguments: Updated `TrainingArguments` instance containing the best hyperparameters found
- from the search.
+ Tuple[TrainingArguments, Optional[str]]: Updated TrainingArguments with best hyperparameters
+ and optional run_id of the best trial.

  Example:
  ```
- best_training_args = perform_hyperparameter_search(
+ best_training_args, run_id = perform_hyperparameter_search(
  trainer_class=Trainer,
  model_init=my_model_init,
  dataset=my_dataset_dict,
@@ -176,50 +246,91 @@ def perform_hyperparameter_search(
  Logging:
  Logs the best hyperparameters found at the end of the search.
  """
- if cehrgpt_args.hyperparameter_tuning:
- sampled_train = sample_dataset(
- dataset["train"],
- cehrgpt_args.hyperparameter_tuning_percentage,
- training_args.seed,
- )
- sampled_val = sample_dataset(
- dataset["validation"],
- cehrgpt_args.hyperparameter_tuning_percentage,
- training_args.seed,
- )
- hyperparam_trainer = trainer_class(
- model_init=model_init,
- data_collator=data_collator,
- train_dataset=sampled_train,
- eval_dataset=sampled_val,
- callbacks=[
- EarlyStoppingCallback(model_args.early_stopping_patience),
- OptunaMetricCallback(),
- ],
- args=training_args,
- )
- # Perform hyperparameter search
- best_trial = hyperparam_trainer.hyperparameter_search(
- direction="minimize",
- hp_space=partial(
- hp_space,
- lr_range=(cehrgpt_args.lr_low, cehrgpt_args.lr_high),
- weight_decays=(
- cehrgpt_args.weight_decays_low,
- cehrgpt_args.weight_decays_high,
- ),
- batch_sizes=cehrgpt_args.hyperparameter_batch_sizes,
- num_train_epochs=cehrgpt_args.hyperparameter_num_train_epochs,
- ),
- backend="optuna",
- n_trials=cehrgpt_args.n_trials,
- compute_objective=lambda m: m["optuna_best_metric"],
- # Ensure reproducibility
- sampler=optuna.samplers.TPESampler(seed=training_args.seed),
- )
- LOG.info("Best hyperparameters: %s", best_trial.hyperparameters)
- # Update training arguments with best hyperparameters and set epochs based on adjusted effective epochs
- for k, v in best_trial.hyperparameters.items():
- setattr(training_args, k, v)
+ if not cehrgpt_args.hyperparameter_tuning:
+ return training_args, None
+
+ # Prepare hyperparameters based on mode
+ if (
+ cehrgpt_args.hyperparameter_tuning_is_grid
+ and cehrgpt_args.hyperparameter_tuning_is_grid
+ ):
+ search_space = create_grid_search_space(cehrgpt_args)
+ total_combinations = calculate_total_combinations(search_space)
+
+ LOG.info(f"Grid search mode: Testing {total_combinations} combinations")
+ LOG.info(f"Search space: {search_space}")
+
+ # Adjust n_trials for grid search if not set appropriately
+ if cehrgpt_args.n_trials < total_combinations:
+ LOG.warning(
+ f"n_trials ({cehrgpt_args.n_trials}) is less than total combinations ({total_combinations}). "
+ f"Setting n_trials to {total_combinations} to test all combinations."
+ )
+ cehrgpt_args.n_trials = total_combinations
+
+ # Configure sampler based on search mode
+ sampler = optuna.samplers.GridSampler(search_space, seed=training_args.seed)
+ else:
+ LOG.info("Bayesian optimization mode (TPE)")
+ LOG.info(f"Learning rate bounds: {cehrgpt_args.hyperparameter_learning_rates}")
+ LOG.info(f"Weight decay bounds: {cehrgpt_args.hyperparameter_weight_decays}")
+ LOG.info(f"Batch sizes: {cehrgpt_args.hyperparameter_batch_sizes}")
+ LOG.info(f"Epochs: {cehrgpt_args.hyperparameter_num_train_epochs}")
+ # Configure the TPE sampler
+ sampler = optuna.samplers.TPESampler(seed=training_args.seed)
+
+ # Prepare datasets
+ save_total_limit_original = training_args.save_total_limit
+ training_args.save_total_limit = 1
+
+ sampled_train = sample_dataset(
+ dataset["train"],
+ cehrgpt_args.hyperparameter_tuning_percentage,
+ training_args.seed,
+ )
+ sampled_val = sample_dataset(
+ dataset["validation"],
+ cehrgpt_args.hyperparameter_tuning_percentage,
+ training_args.seed,
+ )
+ # Create trainer
+ hyperparam_trainer = trainer_class(
+ model_init=model_init,
+ data_collator=data_collator,
+ train_dataset=sampled_train,
+ eval_dataset=sampled_val,
+ callbacks=[
+ EarlyStoppingCallback(model_args.early_stopping_patience),
+ OptunaMetricCallback(),
+ ],
+ args=training_args,
+ )
+
+ best_trial = hyperparam_trainer.hyperparameter_search(
+ direction="minimize",
+ hp_space=partial(
+ hp_space,
+ cehrgpt_args=cehrgpt_args,
+ ),
+ backend="optuna",
+ n_trials=cehrgpt_args.n_trials,
+ compute_objective=lambda m: m["optuna_best_metric"],
+ sampler=sampler,
+ )
+
+ # Log results
+ LOG.info("=" * 50)
+ LOG.info("HYPERPARAMETER SEARCH COMPLETED")
+ LOG.info("=" * 50)
+ LOG.info(f"Best hyperparameters: {best_trial.hyperparameters}")
+ LOG.info(f"Best metric (eval_loss): {best_trial.objective}")
+ LOG.info(f"Best run_id: {best_trial.run_id}")
+ LOG.info("=" * 50)
+
+ # Restore original settings and update with best hyperparameters
+ training_args.save_total_limit = save_total_limit_original
+ for k, v in best_trial.hyperparameters.items():
+ setattr(training_args, k, v)
+ LOG.info(f"Updated training_args.{k} = {v}")

- return training_args
+ return training_args, best_trial.run_id
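The grid branch above has to push `n_trials` up to the number of grid points, otherwise Optuna's `GridSampler` would leave combinations unvisited. A self-contained sketch of that interplay, with a dummy objective standing in for the Trainer (all values illustrative):

```python
# Standalone illustration of GridSampler coverage; no Trainer involved.
import optuna

search_space = {
    "learning_rate": [1e-5, 5e-5],
    "weight_decay": [1e-2],
    "per_device_train_batch_size": [4, 8],
    "num_train_epochs": [10],
}
total_combinations = 1
for values in search_space.values():
    total_combinations *= len(values)  # 2 * 1 * 2 * 1 = 4

def objective(trial: optuna.Trial) -> float:
    lr = trial.suggest_categorical("learning_rate", search_space["learning_rate"])
    wd = trial.suggest_categorical("weight_decay", search_space["weight_decay"])
    bs = trial.suggest_categorical(
        "per_device_train_batch_size", search_space["per_device_train_batch_size"]
    )
    epochs = trial.suggest_categorical("num_train_epochs", search_space["num_train_epochs"])
    return lr * wd * bs * epochs  # stand-in for the real eval_loss objective

study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.GridSampler(search_space),
)
# n_trials must cover every combination, mirroring the n_trials adjustment above.
study.optimize(objective, n_trials=total_combinations)
assert len(study.trials) == total_combinations
```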
cehrgpt/runners/sample_packing_trainer.py
@@ -1,9 +1,10 @@
  from typing import Optional, Union

+ import torch
  from datasets import Dataset
  from torch.utils.data import DataLoader
  from transformers import Trainer
- from transformers.trainer_utils import has_length
+ from transformers.trainer_utils import has_length, seed_worker
  from transformers.utils import import_utils, logging

  from cehrgpt.data.sample_packing_sampler import SamplePackingBatchSampler
@@ -62,7 +63,10 @@ class SamplePackingTrainer(Trainer):
  if "num_of_concepts" in train_dataset.column_names:
  lengths = train_dataset["num_of_concepts"]
  else:
- lengths = [len(sample["input_ids"]) for sample in train_dataset]
+ lengths = [
+ len(sample["input_ids"])
+ for sample in train_dataset.select_columns("input_ids")
+ ]

  LOG.info("Finished computing lengths for the train dataset")
  else:
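The length computation now iterates over `train_dataset.select_columns("input_ids")` rather than the full dataset, presumably so only the one needed column is decoded per row. A toy sketch of the same pattern with `datasets` (made-up data):

```python
# Toy data; select_columns narrows the dataset before iterating row by row.
from datasets import Dataset

ds = Dataset.from_dict(
    {
        "input_ids": [[1, 2, 3], [4, 5], [6]],
        "value_indicators": [[0, 0, 1], [1, 0], [0]],
    }
)

lengths = [len(sample["input_ids"]) for sample in ds.select_columns("input_ids")]
assert lengths == [3, 2, 1]
```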
@@ -102,6 +106,11 @@ class SamplePackingTrainer(Trainer):
  "persistent_workers": self.args.dataloader_persistent_workers,
  "batch_sampler": batch_sampler,
  }
+ if not isinstance(train_dataset, torch.utils.data.IterableDataset):
+ dataloader_params["drop_last"] = self.args.dataloader_drop_last
+ dataloader_params["worker_init_fn"] = seed_worker
+ dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor
+
  return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params))

  def get_eval_dataloader(
cehrgpt/tools/linear_prob/compute_cehrgpt_features.py
@@ -1,8 +1,8 @@
+ import datetime
  import glob
  import os
  import shutil
  import uuid
- from datetime import datetime
  from functools import partial
  from pathlib import Path
  from typing import Optional, Union
@@ -15,6 +15,9 @@ import torch.distributed as dist
  from cehrbert.data_generators.hf_data_generator.meds_utils import CacheFileCollector
  from cehrbert.runners.runner_util import generate_prepared_ds_path
  from datasets import concatenate_datasets, load_from_disk
+ from torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook import (
+ batched_powerSGD_hook,
+ )
  from torch.utils.data import DataLoader
  from tqdm import tqdm
  from transformers.trainer_utils import is_main_process
@@ -25,7 +28,6 @@ from cehrgpt.data.hf_cehrgpt_dataset_collator import (
  CehrGptDataCollator,
  SamplePackingCehrGptDataCollator,
  )
- from cehrgpt.data.hf_cehrgpt_dataset_mapping import ExtractTokenizedSequenceDataMapping
  from cehrgpt.data.sample_packing_sampler import SamplePackingBatchSampler
  from cehrgpt.models.hf_cehrgpt import (
  CEHRGPT2Model,
@@ -159,24 +161,7 @@ def main():
  final_splits = prepare_finetune_dataset(
  data_args, training_args, cehrgpt_args, cache_file_collector
  )
- if cehrgpt_args.expand_tokenizer:
- new_tokenizer_path = os.path.expanduser(training_args.output_dir)
- if tokenizer_exists(new_tokenizer_path):
- cehrgpt_tokenizer = CehrGptTokenizer.from_pretrained(
- new_tokenizer_path
- )
- else:
- cehrgpt_tokenizer = CehrGptTokenizer.expand_trained_tokenizer(
- cehrgpt_tokenizer=cehrgpt_tokenizer,
- dataset=final_splits["train"],
- data_args=data_args,
- concept_name_mapping={},
- )
- cehrgpt_tokenizer.save_pretrained(
- os.path.expanduser(training_args.output_dir)
- )
-
- # TODO: temp solution, this column is mixed typed and causes an issue when transforming the data
+ # TODO: temp solution, this column is mixed typed and causes an issue when transforming the data
  if not data_args.streaming:
  all_columns = final_splits["train"].column_names
  if "visit_concept_ids" in all_columns:
@@ -238,10 +223,6 @@ def main():
  len(processed_dataset["test"]),
  )

- LOG.info(f"cehrgpt_model.config.vocab_size: {cehrgpt_model.config.vocab_size}")
- LOG.info(f"cehrgpt_tokenizer.vocab_size: {cehrgpt_tokenizer.vocab_size}")
- if cehrgpt_model.config.vocab_size < cehrgpt_tokenizer.vocab_size:
- cehrgpt_model.resize_token_embeddings(cehrgpt_tokenizer.vocab_size)
  if (
  cehrgpt_model.config.max_position_embeddings
  < model_args.max_position_embeddings
@@ -264,7 +245,6 @@ def main():
  SamplePackingCehrGptDataCollator,
  cehrgpt_args.max_tokens_per_batch,
  cehrgpt_model.config.max_position_embeddings,
- add_end_token_in_sample_packing=cehrgpt_args.add_end_token_in_sample_packing,
  )
  train_batch_sampler = SamplePackingBatchSampler(
  lengths=train_set["num_of_concepts"],
@@ -339,10 +319,12 @@ def main():
  for data_dir in [data_args.data_folder, data_args.test_data_folder]
  ]
  )
- # This is a pre-caution in case the index_date is not a datetime type
- demographics_df["index_date"] = pd.to_datetime(
- demographics_df["index_date"]
- ).dt.date
+
+ demographics_df["index_date"] = (
+ demographics_df["index_date"].dt.tz_localize("UTC")
+ - datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
+ ).dt.total_seconds()
+
  demographics_dict = {
  (row["person_id"], row["index_date"]): {
  "gender_concept_id": row["gender_concept_id"],
@@ -353,7 +335,7 @@ def main():

  data_loaders = [("train", train_loader), ("test", test_dataloader)]

- ve_token_id = cehrgpt_tokenizer._convert_token_to_id("[VE]")
+ ve_token_id = cehrgpt_tokenizer.ve_token_id
  for split, data_loader in data_loaders:
  # Ensure prediction folder exists
  feature_output_folder = (
@@ -379,9 +361,16 @@ def main():
  prediction_time_posix = batch.pop("index_date").numpy().squeeze()
  if prediction_time_posix.ndim == 0:
  prediction_time_posix = np.asarray([prediction_time_posix])
+
  prediction_time = list(
- map(datetime.fromtimestamp, prediction_time_posix)
+ map(
+ lambda posix_time: datetime.datetime.utcfromtimestamp(
+ posix_time
+ ).replace(tzinfo=None),
+ prediction_time_posix,
+ )
  )
+
  labels = (
  batch.pop("classifier_label")
  .float()
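The other half of the round trip: the POSIX seconds stored in `index_date` are turned back into naive UTC datetimes for the output. A small sketch using the timestamp from the example above; `utcfromtimestamp` already returns a tz-naive value, so the `.replace(tzinfo=None)` in the diff is belt-and-braces:

```python
# Rebuild a naive UTC datetime from POSIX seconds, as in the hunk above.
import datetime

prediction_time_posix = [1583020800.0]
prediction_time = list(
    map(
        lambda posix_time: datetime.datetime.utcfromtimestamp(posix_time).replace(
            tzinfo=None
        ),
        prediction_time_posix,
    )
)
assert prediction_time == [datetime.datetime(2020, 3, 1)]
```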
@@ -393,6 +382,13 @@ def main():
  if labels.ndim == 0:
  labels = np.asarray([labels])

+ # Right now the model does not support this column, we need to pop it
+ if "epoch_times" in batch:
+ batch.pop("epoch_times")
+
+ if "ages" in batch:
+ batch.pop("ages")
+
  batch = {k: v.to(device) for k, v in batch.items()}
  # Forward pass
  cehrgpt_output = cehrgpt_model(