PyPI - cehrgpt - Versions diffs - 0.0.2__py3-none-any.whl → 0.1.1__py3-none-any.whl - Mend

cehrgpt 0.0.2__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

cehrgpt/analysis/irregularity.py +36 -0
cehrgpt/data/hf_cehrgpt_dataset.py +25 -4
cehrgpt/data/hf_cehrgpt_dataset_collator.py +635 -97
cehrgpt/data/hf_cehrgpt_dataset_mapping.py +308 -95
cehrgpt/data/sample_packing_sampler.py +181 -0
cehrgpt/generation/generate_batch_hf_gpt_sequence.py +12 -9
cehrgpt/generation/omop_converter_batch.py +32 -2
cehrgpt/gpt_utils.py +20 -2
cehrgpt/models/config.py +35 -0
cehrgpt/models/hf_cehrgpt.py +470 -106
cehrgpt/models/hf_modeling_outputs.py +1 -0
cehrgpt/models/special_tokens.py +1 -0
cehrgpt/models/tokenization_hf_cehrgpt.py +358 -71
cehrgpt/runners/data_utils.py +358 -0
cehrgpt/runners/gpt_runner_util.py +0 -10
cehrgpt/runners/hf_cehrgpt_finetune_runner.py +181 -283
cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +288 -112
cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +90 -0
cehrgpt/runners/hyperparameter_search_util.py +10 -8
cehrgpt/runners/sample_packing_trainer.py +185 -0
cehrgpt/simulations/generate_plots.py +95 -0
cehrgpt/simulations/run_simulation.sh +24 -0
cehrgpt/simulations/time_embedding_simulation.py +250 -0
cehrgpt/simulations/time_token_simulation.py +177 -0
cehrgpt/time_to_event/config/1_year_cabg.yaml +23 -0
cehrgpt/time_to_event/time_to_event_model.py +2 -13
cehrgpt/time_to_event/time_to_event_prediction.py +27 -13
cehrgpt/tools/linear_prob/__init__.py +0 -0
cehrgpt/tools/linear_prob/compute_cehrgpt_features.py +495 -0
cehrgpt/tools/linear_prob/train_with_cehrgpt_features.py +152 -0
{cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/METADATA +11 -8
{cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/RECORD +36 -32
{cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/WHEEL +1 -1
cehrgpt/data/hf_cehrgpt_dpo_collator.py +0 -71
cehrgpt/data/hf_cehrgpt_dpo_dataset_mapping.py +0 -61
cehrgpt/generation/generate_paired_cehrgpt_sequence.py +0 -224
cehrgpt/rl_finetune/cehrgpt_dpo_trainer.py +0 -586
cehrgpt/rl_finetune/cehrgpt_ppo_trainer.py +0 -464
cehrgpt/rl_finetune/ppo_finetune.py +0 -394
cehrgpt/rl_finetune/ppo_finetune_v2.py +0 -373
cehrgpt/runners/hf_cehrgpt_dpo_runner.py +0 -119
/cehrgpt/{rl_finetune → simulations}/__init__.py +0 -0
{cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info/licenses}/LICENSE +0 -0
{cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/top_level.txt +0 -0

cehrgpt/analysis/irregularity.py ADDED Viewed

@@ -0,0 +1,36 @@
+import os
+import polars as pl
+from cehrgpt.gpt_utils import extract_time_interval_in_days, is_att_token
+def main(args):
+    dataset = pl.read_parquet(os.path.join(args.input_dir, "*.parquet"))
+    time_token_frequency_df = (
+        dataset.select(pl.col("concept_ids").explode().alias("concept_id"))
+        .filter(pl.col("concept_id").map_elements(is_att_token))
+        .with_columns(
+            pl.col("concept_id")
+            .map_elements(extract_time_interval_in_days)
+            .alias("time_interval")
+        )
+    )
+    results = time_token_frequency_df.select(
+        pl.mean("time_interval").alias("mean"), pl.std("time_interval").alias("std")
+    ).to_dicts()[0]
+    print(results)
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description="EHR Irregularity analysis")
+    parser.add_argument(
+        "--input_dir",
+        dest="input_dir",
+        action="store",
+        help="The path for where the input data folder",
+        required=True,
+    )
+    main(parser.parse_args())

cehrgpt/data/hf_cehrgpt_dataset.py CHANGED Viewed

@@ -1,9 +1,10 @@
-from typing import Union
+from typing import Optional, Union
 from cehrbert.data_generators.hf_data_generator.hf_dataset import (
     FINETUNING_COLUMNS,
     apply_cehrbert_dataset_mapping,
 )
+from cehrbert.data_generators.hf_data_generator.meds_utils import CacheFileCollector
 from cehrbert.runners.hf_runner_argument_dataclass import DataTrainingArguments
 from datasets import Dataset, DatasetDict
@@ -22,6 +23,7 @@ CEHRGPT_COLUMNS = [
     "num_of_visits",
     "values",
     "value_indicators",
+    "epoch_times",
 ]
 TRANSFORMER_COLUMNS = ["input_ids"]
@@ -31,16 +33,25 @@ def create_cehrgpt_pretraining_dataset(
     dataset: Union[Dataset, DatasetDict],
     cehrgpt_tokenizer: CehrGptTokenizer,
     data_args: DataTrainingArguments,
-) -> Dataset:
+    cache_file_collector: Optional[CacheFileCollector] = None,
+) -> Union[Dataset, DatasetDict]:
     required_columns = TRANSFORMER_COLUMNS + CEHRGPT_COLUMNS
+    # TODO: temp solution, this column is mixed typed and causes an issue when transforming the data
+    if not data_args.streaming:
+        if isinstance(dataset, DatasetDict):
+            all_columns = dataset["train"].column_names
+        else:
+            all_columns = dataset.column_names
+        if "visit_concept_ids" in all_columns:
+            dataset.remove_columns(["visit_concept_ids"])
     dataset = apply_cehrbert_dataset_mapping(
         dataset,
         HFCehrGptTokenizationMapping(cehrgpt_tokenizer),
         num_proc=data_args.preprocessing_num_workers,
         batch_size=data_args.preprocessing_batch_size,
         streaming=data_args.streaming,
+        cache_file_collector=cache_file_collector,
     )
     if not data_args.streaming:
         if isinstance(dataset, DatasetDict):
             all_columns = dataset["train"].column_names
@@ -56,8 +67,17 @@ def create_cehrgpt_finetuning_dataset(
     dataset: Union[Dataset, DatasetDict],
     cehrgpt_tokenizer: CehrGptTokenizer,
     data_args: DataTrainingArguments,
-) -> Dataset:
+    cache_file_collector: Optional[CacheFileCollector] = None,
+) -> Union[Dataset, DatasetDict]:
     required_columns = TRANSFORMER_COLUMNS + CEHRGPT_COLUMNS + FINETUNING_COLUMNS
+    # TODO: temp solution, this column is mixed typed and causes an issue when transforming the data
+    if not data_args.streaming:
+        if isinstance(dataset, DatasetDict):
+            all_columns = dataset["train"].column_names
+        else:
+            all_columns = dataset.column_names
+        if "visit_concept_ids" in all_columns:
+            dataset.remove_columns(["visit_concept_ids"])
     mapping_functions = [
         HFFineTuningMapping(cehrgpt_tokenizer),
     ]
@@ -68,6 +88,7 @@ def create_cehrgpt_finetuning_dataset(
             num_proc=data_args.preprocessing_num_workers,
             batch_size=data_args.preprocessing_batch_size,
             streaming=data_args.streaming,
+            cache_file_collector=cache_file_collector,
         )
     if not data_args.streaming:

cehrgpt 0.0.2__py3-none-any.whl → 0.1.1__py3-none-any.whl

cehrgpt 0.0.2__py3-none-any.whl → 0.1.1__py3-none-any.whl