cehrgpt 0.0.2__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cehrgpt/analysis/irregularity.py +36 -0
- cehrgpt/data/hf_cehrgpt_dataset.py +25 -4
- cehrgpt/data/hf_cehrgpt_dataset_collator.py +635 -97
- cehrgpt/data/hf_cehrgpt_dataset_mapping.py +308 -95
- cehrgpt/data/sample_packing_sampler.py +181 -0
- cehrgpt/generation/generate_batch_hf_gpt_sequence.py +12 -9
- cehrgpt/generation/omop_converter_batch.py +32 -2
- cehrgpt/gpt_utils.py +20 -2
- cehrgpt/models/config.py +35 -0
- cehrgpt/models/hf_cehrgpt.py +470 -106
- cehrgpt/models/hf_modeling_outputs.py +1 -0
- cehrgpt/models/special_tokens.py +1 -0
- cehrgpt/models/tokenization_hf_cehrgpt.py +358 -71
- cehrgpt/runners/data_utils.py +358 -0
- cehrgpt/runners/gpt_runner_util.py +0 -10
- cehrgpt/runners/hf_cehrgpt_finetune_runner.py +181 -283
- cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +288 -112
- cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +90 -0
- cehrgpt/runners/hyperparameter_search_util.py +10 -8
- cehrgpt/runners/sample_packing_trainer.py +185 -0
- cehrgpt/simulations/generate_plots.py +95 -0
- cehrgpt/simulations/run_simulation.sh +24 -0
- cehrgpt/simulations/time_embedding_simulation.py +250 -0
- cehrgpt/simulations/time_token_simulation.py +177 -0
- cehrgpt/time_to_event/config/1_year_cabg.yaml +23 -0
- cehrgpt/time_to_event/time_to_event_model.py +2 -13
- cehrgpt/time_to_event/time_to_event_prediction.py +27 -13
- cehrgpt/tools/linear_prob/__init__.py +0 -0
- cehrgpt/tools/linear_prob/compute_cehrgpt_features.py +495 -0
- cehrgpt/tools/linear_prob/train_with_cehrgpt_features.py +152 -0
- {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/METADATA +11 -8
- {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/RECORD +36 -32
- {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/WHEEL +1 -1
- cehrgpt/data/hf_cehrgpt_dpo_collator.py +0 -71
- cehrgpt/data/hf_cehrgpt_dpo_dataset_mapping.py +0 -61
- cehrgpt/generation/generate_paired_cehrgpt_sequence.py +0 -224
- cehrgpt/rl_finetune/cehrgpt_dpo_trainer.py +0 -586
- cehrgpt/rl_finetune/cehrgpt_ppo_trainer.py +0 -464
- cehrgpt/rl_finetune/ppo_finetune.py +0 -394
- cehrgpt/rl_finetune/ppo_finetune_v2.py +0 -373
- cehrgpt/runners/hf_cehrgpt_dpo_runner.py +0 -119
- /cehrgpt/{rl_finetune → simulations}/__init__.py +0 -0
- {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info/licenses}/LICENSE +0 -0
- {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/top_level.txt +0 -0
cehrgpt/simulations/time_token_simulation.py

```diff
@@ -0,0 +1,177 @@
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.optim as optim
+from sklearn.metrics import accuracy_score, roc_auc_score
+from torch.nn import CrossEntropyLoss
+from transformers import BertConfig, BertModel
+
+from cehrgpt.simulations.time_embedding_simulation import generate_simulation_data
+
+
+class ModelTimeToken(torch.nn.Module):
+    def __init__(self, vocab_size: int):
+        super(ModelTimeToken, self).__init__()
+        self.embedding = torch.nn.Embedding(vocab_size, 16)
+        self.bert = BertModel(
+            BertConfig(
+                vocab_size=vocab_size,
+                hidden_size=16,
+                num_attention_heads=2,
+                num_hidden_layers=2,
+                intermediate_size=32,
+                hidden_dropout_prob=0.0,
+                attention_probs_dropout_prob=0.0,
+                max_position_embeddings=3,
+            ),
+            add_pooling_layer=False,
+        )
+        self.linear = torch.nn.Linear(48, 2)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        labels: Optional[torch.LongTensor] = None,
+    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
+        bz = input_ids.shape[0]
+        x = self.embedding(input_ids)
+        bert_output = self.bert.forward(inputs_embeds=x, return_dict=True)
+        output = bert_output.last_hidden_state.reshape((bz, 48))
+        y = self.linear(output)
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(y, labels)
+        return loss, y
+
+
+def create_time_token_tokenizer(simulated_data):
+    vocab = []
+    for row in simulated_data:
+        x1, x2, t1, t2, y = row
+        x1 = f"c-{x1}"
+        x2 = f"c-{x2}"
+        t = f"t-{t2 - t1}"
+        if x1 not in vocab:
+            vocab.append(x1)
+        if x2 not in vocab:
+            vocab.append(x2)
+        if t not in vocab:
+            vocab.append(t)
+    return {c: i + 1 for i, c in enumerate(vocab)}
+
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+def eval_step(simulated_data, time_token_tokenizer, time_embedding_model):
+    time_embedding_model.eval()
+    eval_input_ids = []
+    eval_y = []
+    for row in simulated_data:
+        x1, x2, t1, t2, y = row
+        x1 = f"c-{x1}"
+        x2 = f"c-{x2}"
+        t = f"t-{t2 - t1}"
+        eval_input_ids.append(
+            [
+                time_token_tokenizer[x1],
+                time_token_tokenizer[t],
+                time_token_tokenizer[x2],
+            ]
+        )
+        eval_y.append(y)
+    with torch.no_grad():
+        batched_input_ids = torch.tensor(eval_input_ids, dtype=torch.long).to(device)
+        batched_y = np.asarray(eval_y)
+        # Compute loss and forward pass
+        _, y_pred = time_embedding_model(batched_input_ids)
+        y_probs = torch.nn.functional.softmax(y_pred, dim=1)
+        y_probs = y_probs.detach().cpu().numpy()
+        roc_auc = roc_auc_score(batched_y, y_probs[:, 1])
+        accuracy = accuracy_score(batched_y, y_probs[:, 1] > y_probs[:, 0])
+        print(f"ROC AUC: {roc_auc}")
+        print(f"Accuracy: {accuracy}")
+    return accuracy, roc_auc
+
+
+def train_step(
+    simulated_data, time_token_tokenizer, time_embedding_model, time_embedding_optimizer
+):
+    batched_input_ids = []
+    batched_y = []
+    indices = np.random.choice(simulated_data.shape[0], size=8, replace=False)
+    for row in simulated_data[indices, :]:
+        x1, x2, t1, t2, y = row
+        x1 = f"c-{x1}"
+        x2 = f"c-{x2}"
+        t = f"t-{t2 - t1}"
+        batched_input_ids.append(
+            [
+                time_token_tokenizer[x1],
+                time_token_tokenizer[t],
+                time_token_tokenizer[x2],
+            ]
+        )
+        batched_y.append(y)
+    batched_input_ids = torch.tensor(batched_input_ids, dtype=torch.long).to(device)
+    batched_y = torch.tensor(batched_y, dtype=torch.long).to(device)
+    # Zero the gradients
+    time_embedding_optimizer.zero_grad()
+    # Compute loss and forward pass
+    loss, _ = time_embedding_model(batched_input_ids, batched_y)
+    # Backward pass (compute gradients)
+    loss.backward()
+    # Update model parameters
+    time_embedding_optimizer.step()
+    return loss
+
+
+def main(args):
+    simulated_data = generate_simulation_data(args.n_samples)
+    time_token_tokenizer = create_time_token_tokenizer(simulated_data)
+    time_embedding_model = ModelTimeToken(len(time_token_tokenizer) + 1).to(device)
+    time_embedding_optimizer = optim.Adam(time_embedding_model.parameters(), lr=0.001)
+    steps = []
+    roc_aucs = []
+    accuracies = []
+    for step in range(args.n_steps):
+        loss = train_step(
+            simulated_data,
+            time_token_tokenizer,
+            time_embedding_model,
+            time_embedding_optimizer,
+        )
+        print(f"Step {step}: Loss = {loss.item()}")
+        # Periodic evaluation
+        if (
+            step % args.eval_frequency == 0
+            and step > 0
+        ):
+            accuracy, roc_auc = eval_step(
+                simulated_data, time_token_tokenizer, time_embedding_model
+            )
+            steps.append(step)
+            roc_aucs.append(roc_auc)
+            accuracies.append(accuracy)
+    return {"steps": steps, "roc_auc": roc_aucs, "accuracy": accuracies}
+
+
+if __name__ == "__main__":
+    import argparse
+    import json
+    from pathlib import Path
+
+    parser = argparse.ArgumentParser("Model with time token simulation")
+    parser.add_argument("--output_dir", type=str, required=True)
+    parser.add_argument("--n_steps", type=int, default=10000)
+    parser.add_argument("--n_samples", type=int, default=1000)
+    parser.add_argument("--batch_size", type=int, default=128)
+    parser.add_argument("--eval_frequency", type=int, default=100)
+    args = parser.parse_args()
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(exist_ok=True, parents=True)
+    metrics = main(args)
+    with open(output_dir / "time_token_metrics.json", "w") as f:
+        json.dump(metrics, f)
```
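The script writes its metrics to `time_token_metrics.json` under `--output_dir`. A minimal sketch of reading that file back, assuming a hypothetical output directory named `sim_output` (the JSON keys come from `main()` above):

```python
# Sketch: inspect the metrics JSON produced by time_token_simulation.py.
# "sim_output" is a hypothetical --output_dir value, not part of the release.
import json
from pathlib import Path

metrics = json.loads(Path("sim_output/time_token_metrics.json").read_text())
for step, auc, acc in zip(metrics["steps"], metrics["roc_auc"], metrics["accuracy"]):
    print(f"step={step}: roc_auc={auc:.3f}, accuracy={acc:.3f}")
```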
cehrgpt/time_to_event/config/1_year_cabg.yaml

```diff
@@ -0,0 +1,23 @@
+task_name: "cabg_prediction"
+outcome_events: [
+  "43528001",
+  "43528003",
+  "43528004",
+  "43528002",
+  "4305852",
+  "4168831",
+  "2107250",
+  "2107216",
+  "2107222",
+  "2107231",
+  "4336464",
+  "4231998",
+  "4284104",
+  "2100873",
+]
+future_visit_start: 0
+future_visit_end: -1
+prediction_window_start: 0
+prediction_window_end: 365
+max_new_tokens: 1024
+include_descendants: true
```
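A minimal sketch of parsing this task config, assuming PyYAML; the diff does not show how cehrgpt itself loads the file, so this is illustrative only:

```python
# Sketch: parse the 1_year_cabg.yaml task config with PyYAML (assumption:
# cehrgpt may load it differently; this only shows the file's structure).
import yaml

with open("cehrgpt/time_to_event/config/1_year_cabg.yaml") as f:
    task_config = yaml.safe_load(f)

print(task_config["task_name"])              # "cabg_prediction"
print(len(task_config["outcome_events"]))    # 14 outcome concept IDs
print(task_config["prediction_window_end"])  # 365 (days)
```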
cehrgpt/time_to_event/time_to_event_model.py

```diff
@@ -80,20 +80,9 @@ class TimeToEventModel:
         return token in self.outcome_events
 
     def simulate(
-        self,
+        self,
+        partial_history: Union[np.ndarray, List[str]],
     ) -> List[List[str]]:
-
-        sequence_is_demographics = len(partial_history) == 4 and partial_history[
-            0
-        ].startswith("year")
-        sequence_ends_ve = is_visit_end(partial_history[-1])
-
-        if not (sequence_is_demographics | sequence_ends_ve):
-            raise ValueError(
-                "There are only two types of sequences allowed. 1) the sequence only contains "
-                "demographics; 2) the sequence ends on VE;"
-            )
-
         token_ids = self.tokenizer.encode(partial_history)
         prompt = torch.tensor(token_ids).unsqueeze(0).to(self.device)
```
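With this change `simulate()` no longer validates `partial_history`. Callers that still want the old invariant (a 4-token demographics prompt, or a sequence ending on a visit-end token) can reapply it themselves; a standalone sketch reconstructed from the removed lines, where the `is_visit_end` import path is an assumption:

```python
# Sketch: the guard simulate() used to perform, reconstructed from the
# removed lines above so callers can run it before calling simulate().
from typing import List

from cehrgpt.gpt_utils import is_visit_end  # assumed import path


def check_partial_history(partial_history: List[str]) -> None:
    is_demographics = len(partial_history) == 4 and partial_history[0].startswith("year")
    ends_on_ve = is_visit_end(partial_history[-1])
    if not (is_demographics or ends_on_ve):
        raise ValueError(
            "There are only two types of sequences allowed. 1) the sequence only "
            "contains demographics; 2) the sequence ends on VE;"
        )
```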
cehrgpt/time_to_event/time_to_event_prediction.py

```diff
@@ -118,9 +118,9 @@ def main(args):
     LOG.info(f"Top P {args.top_p}")
     LOG.info(f"Top K {args.top_k}")
 
-    cehrgpt_model.resize_position_embeddings(
-        cehrgpt_model.config.max_position_embeddings + task_config.max_new_tokens
-    )
+    # cehrgpt_model.resize_position_embeddings(
+    #     cehrgpt_model.config.max_position_embeddings + task_config.max_new_tokens
+    # )
 
     generation_config = TimeToEventModel.get_generation_config(
         tokenizer=cehrgpt_tokenizer,
```
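With the resize call commented out, the prompt plus `max_new_tokens` must fit within the model's existing context window. A hypothetical caller-side guard, not part of the release:

```python
# Sketch: fail fast if generation would exceed the context window, now that
# position embeddings are no longer resized. The function is hypothetical.
def assert_fits_context(prompt_len: int, max_new_tokens: int, max_position_embeddings: int) -> None:
    if prompt_len + max_new_tokens > max_position_embeddings:
        raise ValueError(
            f"prompt_len ({prompt_len}) + max_new_tokens ({max_new_tokens}) "
            f"exceeds max_position_embeddings ({max_position_embeddings})"
        )
```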
```diff
@@ -190,14 +190,22 @@ def main(args):
             args.max_n_trial,
         )
         visit_counter = sum([int(is_visit_end(_)) for _ in partial_history])
+        predicted_boolean_probability = (
+            sum([event != "0" for event in concept_time_to_event.outcome_events])
+            / len(concept_time_to_event.outcome_events)
+            if concept_time_to_event
+            else 0.0
+        )
         tte_outputs.append(
             {
-                "
-                "
+                "subject_id": record["person_id"],
+                "prediction_time": record["index_date"],
                 "visit_counter": visit_counter,
-                "
+                "boolean_value": label,
+                "predicted_boolean_probability": predicted_boolean_probability,
+                "predicted_boolean_value": None,
                 "time_to_event": time_to_event,
-                "
+                "trials": (
                     asdict(concept_time_to_event) if concept_time_to_event else None
                 ),
             }
```
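After this change each appended record follows the new column layout; an illustrative example where the values are made up but the keys match the diff:

```python
# Illustrative record appended to tte_outputs after this change; the values
# below are invented, only the keys and their sources come from the diff.
example_record = {
    "subject_id": 12345,                    # from record["person_id"]
    "prediction_time": "2020-01-01",        # from record["index_date"]
    "visit_counter": 7,                     # visit-end tokens in the history
    "boolean_value": True,                  # the observed label
    "predicted_boolean_probability": 0.25,  # share of trials predicting the outcome
    "predicted_boolean_value": None,        # left unset by this script
    "time_to_event": 42,
    "trials": None,                         # asdict(concept_time_to_event) when present
}
```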
```diff
@@ -263,9 +271,13 @@ def filter_out_existing_results(
     parquet_files = glob.glob(os.path.join(prediction_output_folder_name, "*parquet"))
     if parquet_files:
         cohort_members = set()
-        results_dataframe = pd.read_parquet(parquet_files)[
+        results_dataframe = pd.read_parquet(parquet_files)[
+            ["subject_id", "prediction_time"]
+        ]
         for row in results_dataframe.itertuples():
-            cohort_members.add(
+            cohort_members.add(
+                (row.subject_id, row.prediction_time.strftime("%Y-%m-%d"))
+            )
 
     def filter_func(batched):
         return [
```
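The added lines key previously written results by a `(subject_id, formatted prediction_time)` tuple so reruns skip finished cohort members; a self-contained sketch of that keying:

```python
# Sketch of the resume logic above: previously written (subject_id,
# "YYYY-MM-DD") pairs are collected so matching cohort rows are skipped.
from datetime import datetime

cohort_members = {(1, "2020-01-01"), (2, "2020-06-15")}

def already_predicted(subject_id: int, prediction_time: datetime) -> bool:
    return (subject_id, prediction_time.strftime("%Y-%m-%d")) in cohort_members

print(already_predicted(1, datetime(2020, 1, 1)))  # True  -> filtered out
print(already_predicted(3, datetime(2020, 1, 1)))  # False -> still processed
```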
```diff
@@ -292,12 +304,14 @@ def flush_to_disk_if_full(
         pd.DataFrame(
             tte_outputs,
             columns=[
-                "
-                "
+                "subject_id",
+                "prediction_time",
                 "visit_counter",
-                "
+                "boolean_value",
+                "predicted_boolean_probability",
+                "predicted_boolean_value",
                 "time_to_event",
-                "
+                "trials",
             ],
         ).to_parquet(output_parquet_file)
         tte_outputs.clear()
```
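The renamed columns make the flushed parquet directly usable for standard binary-classification scoring; a sketch assuming a hypothetical output file:

```python
# Sketch: score flushed predictions using the renamed columns. The parquet
# path is hypothetical, and roc_auc_score needs both classes present.
import pandas as pd
from sklearn.metrics import roc_auc_score

df = pd.read_parquet("tte_predictions/part-0.parquet")
auc = roc_auc_score(df["boolean_value"], df["predicted_boolean_probability"])
print(f"ROC AUC: {auc:.3f}")
```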
File without changes