PyPI - cehrgpt - Versions diffs - 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl - Mend

cehrgpt 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

cehrgpt/data/hf_cehrgpt_dataset_mapping.py +267 -1
cehrgpt/data/hf_cehrgpt_dpo_collator.py +71 -0
cehrgpt/data/hf_cehrgpt_dpo_dataset_mapping.py +61 -0
cehrgpt/generation/generate_paired_cehrgpt_sequence.py +224 -0
cehrgpt/generation/omop_converter_batch.py +3 -0
cehrgpt/models/hf_cehrgpt.py +1 -0
cehrgpt/models/tokenization_hf_cehrgpt.py +2 -2
cehrgpt/rl_finetune/__init__.py +0 -0
cehrgpt/rl_finetune/cehrgpt_dpo_trainer.py +586 -0
cehrgpt/rl_finetune/cehrgpt_ppo_trainer.py +464 -0
cehrgpt/rl_finetune/ppo_finetune.py +394 -0
cehrgpt/rl_finetune/ppo_finetune_v2.py +373 -0
cehrgpt/runners/hf_cehrgpt_dpo_runner.py +119 -0
cehrgpt/runners/hf_cehrgpt_finetune_runner.py +24 -3
cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +44 -8
cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +4 -0
cehrgpt/tools/generate_causal_patient_split_by_age.py +146 -0
{cehrgpt-0.0.1.dist-info → cehrgpt-0.0.2.dist-info}/METADATA +52 -6
{cehrgpt-0.0.1.dist-info → cehrgpt-0.0.2.dist-info}/RECORD +22 -12
{cehrgpt-0.0.1.dist-info → cehrgpt-0.0.2.dist-info}/WHEEL +1 -1
{cehrgpt-0.0.1.dist-info → cehrgpt-0.0.2.dist-info}/LICENSE +0 -0
{cehrgpt-0.0.1.dist-info → cehrgpt-0.0.2.dist-info}/top_level.txt +0 -0

cehrgpt/tools/generate_causal_patient_split_by_age.py ADDED Viewed

@@ -0,0 +1,146 @@
+import numpy as np
+import pandas as pd
+# Race concept ids (as strings) that map_race() replaces with another
+# concept id; any id not listed here is returned unchanged.
+race_mapping = dict(
+    (
+        ("38003613", "8557"),
+        ("38003610", "8557"),
+        ("38003579", "8515"),
+        ("44814653", "0"),
+    )
+)
+# Age-group labels treated as invalid: main() drops every patient whose
+# age group is one of these before the split is sampled.
+invalid_age_groups = [
+    # Every decade from 100 up to (but excluding) 200.
+    *(f"age:{lower}-{lower + 10}" for lower in range(100, 200, 10)),
+    # Isolated out-of-range decades, listed explicitly.
+    *(f"age:{lower}-{lower + 10}" for lower in (640, 680, 730, 740, 890, 900)),
+    # Bucket that age_group_func() produces for ages -10..-1.
+    "age:-10-0",
+]
+def age_group_func(age_str):
+    """
+    Map an age token onto the label of its 10-year bucket.
+    Args:
+        age_str (str): Age token of the form "age:XX", where XX is an integer.
+    Returns:
+        str: Bucket label "age:LOW-HIGH", where LOW is XX floored to a
+            multiple of 10 and HIGH is LOW + 10 (e.g. "age:37" -> "age:30-40").
+    """
+    raw_age = age_str.split(":")[1]
+    # Floor division keeps negative ages in the bucket below zero.
+    lower_bound = (int(raw_age) // 10) * 10
+    upper_bound = lower_bound + 10
+    return f"age:{lower_bound}-{upper_bound}"
+def map_race(race):
+    """Return the race_mapping replacement for race, or race itself if unmapped."""
+    if race in race_mapping:
+        return race_mapping[race]
+    return race
+def main(args):
+    """
+    Build an age-weighted train/validation split of patients and save it.
+
+    Reads the patient sequences from ``args.patient_sequence``, derives year,
+    age group, gender and race from the first four entries of ``concept_ids``,
+    drops patients whose gender is "0" or whose age group is listed in
+    ``invalid_age_groups``, samples ``args.num_patients`` of the remaining
+    patients without replacement as the "train" split, labels the rest
+    "validation", and writes both to ``args.output_folder`` as parquet.
+
+    Args:
+        args: Parsed CLI arguments exposing ``patient_sequence`` (input
+            parquet path), ``num_patients`` (size of the train split) and
+            ``output_folder`` (path passed to ``DataFrame.to_parquet``).
+    Raises:
+        ValueError: If ``num_patients`` exceeds the number of patients that
+            remain after cleaning.
+    """
+    # Load data
+    patient_sequence = pd.read_parquet(args.patient_sequence)
+    # Extract and preprocess demographics: the first four concepts of each
+    # sequence are read as year, age, gender and race, in that order.
+    demographics = patient_sequence.concept_ids.apply(
+        lambda concept_ids: concept_ids[:4]
+    )
+    patient_sequence["year"] = demographics.apply(lambda concepts: concepts[0])
+    patient_sequence["age"] = demographics.apply(
+        lambda concepts: age_group_func(concepts[1])
+    )
+    patient_sequence["gender"] = demographics.apply(lambda concepts: concepts[2])
+    # Map race on patient_sequence itself so the saved split carries the
+    # mapped ids; mapping only a temporary slice would be discarded.
+    patient_sequence["race"] = demographics.apply(
+        lambda concepts: map_race(concepts[3])
+    )
+    # A sequence is flagged as a death case when its second-to-last concept
+    # is the "[DEATH]" token.
+    patient_sequence["death"] = patient_sequence.concept_ids.apply(
+        lambda concept_ids: int(concept_ids[-2] == "[DEATH]")
+    )
+    demographics = patient_sequence[
+        ["person_id", "death", "year", "age", "gender", "race", "split"]
+    ]
+    demographics_clean = demographics[
+        (demographics.gender != "0") & (~demographics.age.isin(invalid_age_groups))
+    ]
+    patient_sequence_clean = patient_sequence[
+        patient_sequence.person_id.isin(demographics_clean.person_id)
+    ]
+    # Calculate probabilities: share of cleaned patients per age group
+    # (groupby returns the groups sorted by their label).
+    probs = (
+        demographics_clean.groupby(["age"])["person_id"].count()
+        / len(demographics_clean)
+    ).reset_index()
+    probs.rename(columns={"person_id": "prob"}, inplace=True)
+    # Adjust probabilities: weights 10, 9, ..., 1 are aligned by row position
+    # with the sorted age groups, so groups that sort first are up-weighted.
+    # NOTE(review): a row beyond the tenth gets NaN, i.e. weight zero.
+    # Seeds NumPy's global RNG; the sampling below uses its own random_state.
+    np.random.seed(42)
+    position_weights = pd.Series(np.arange(10, 0, -1))
+    adjusted_probs = probs.prob * position_weights
+    probs["adjusted_prob"] = adjusted_probs / adjusted_probs.sum()
+    demographics_for_sampling = patient_sequence_clean[
+        ["year", "age", "race", "gender", "person_id"]
+    ].merge(probs, on="age")
+    demographics_for_sampling["adjusted_prob"] = (
+        demographics_for_sampling.adjusted_prob
+        / demographics_for_sampling.adjusted_prob.sum()
+    )
+    # Fail early with an actionable message instead of the generic pandas
+    # error raised when sampling more rows than exist without replacement.
+    num_available = len(demographics_for_sampling)
+    if args.num_patients > num_available:
+        raise ValueError(
+            f"--num_patients={args.num_patients} exceeds the {num_available} "
+            "patients available after cleaning; cannot sample without replacement"
+        )
+    # Train/Validation Split
+    causal_train_split = demographics_for_sampling.sample(
+        args.num_patients, replace=False, weights="adjusted_prob", random_state=1
+    )
+    causal_train_split["split"] = "train"
+    # Copy before labelling so the assignment never targets a slice of
+    # demographics_for_sampling.
+    causal_val_split = demographics_for_sampling[
+        ~demographics_for_sampling.person_id.isin(causal_train_split.person_id)
+    ].copy()
+    causal_val_split["split"] = "validation"
+    causal_train_val_split = pd.concat([causal_train_split, causal_val_split])
+    # Save outputs
+    causal_train_val_split.to_parquet(args.output_folder, index=False)
+if __name__ == "__main__":
+    import argparse
+    # Command-line interface: input parquet, train-split size, output path.
+    cli = argparse.ArgumentParser(
+        description="Arguments for a causal patient split by age groups"
+    )
+    cli.add_argument("--patient_sequence", required=True)
+    cli.add_argument("--num_patients", type=int, default=1_000_000, required=False)
+    cli.add_argument("--output_folder", required=True)
+    # Parse the command line and run the split
+    cli_args = cli.parse_args()
+    main(cli_args)

{cehrgpt-0.0.1.dist-info → cehrgpt-0.0.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: cehrgpt
-Version: 0.0.1
+Version: 0.0.2
 Summary: CEHR-GPT: Generating Electronic Health Records with Chronological Patient Timelines
 Author-email: Chao Pang <chaopang229@gmail.com>, Xinzhuo Jiang <xj2193@cumc.columbia.edu>, Krishna Kalluri <kk3326@cumc.columbia.edu>, Elise Minto <em3697@cumc.columbia.edu>, Jason Patterson <jp3477@cumc.columbia.edu>, Nishanth Parameshwar Pavinkurve <np2689@cumc.columbia.edu>, Karthik Natarajan <kn2174@cumc.columbia.edu>
 License: MIT License
@@ -12,11 +12,12 @@ Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.10.0
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: cehrbert==1.3.3
+Requires-Dist: cehrbert==1.3.8
 Requires-Dist: openai==1.54.3
 Requires-Dist: optuna==4.0.0
 Requires-Dist: transformers==4.40.0
-Requires-Dist: tokenizers==0.19
+Requires-Dist: tokenizers==0.19.0
+Requires-Dist: peft==0.10.0
 Requires-Dist: trl==0.11.4
 Provides-Extra: dev
 Requires-Dist: pre-commit; extra == "dev"
@@ -50,11 +51,57 @@ CEHRGPT is a synthetic data generation model developed to handle structured elec
 To install CEHRGPT, clone this repository and install the required dependencies.
 ```bash
-git clone https://github.com/knatarajan-lab/cehrgpt-public.git
-cd cehrgpt-public
+git clone https://github.com/knatarajan-lab/cehrgpt.git
+cd cehrgpt
 pip install .
 ```
+## Pretrain
+Pretrain cehrgpt using the Hugging Face trainer; the parameters can be found in the sample configuration YAML file.
+```bash
+mkdir test_results
+# This is NOT required when streaming is set to true
+mkdir test_dataset_prepared
+python -u -m cehrgpt.runners.hf_cehrgpt_pretrain_runner sample_configs/cehrgpt_pretrain_sample_config.yaml
+```
+## Generate synthetic sequences
+Generate synthetic sequences using the trained model
+```bash
+export TRANSFORMERS_VERBOSITY=info
+export CUDA_VISIBLE_DEVICES="0"
+python -u -m cehrgpt.generation.generate_batch_hf_gpt_sequence \
+  --model_folder test_results \
+  --tokenizer_folder test_results \
+  --output_folder test_results \
+  --num_of_patients 128 \
+  --batch_size 32 \
+  --buffer_size 128 \
+  --context_window 1024 \
+  --sampling_strategy TopPStrategy \
+  --top_p 1.0 --temperature 1.0 --repetition_penalty 1.0 \
+  --epsilon_cutoff 0.00 \
+  --demographic_data_path sample_data/pretrain
+```
+## Convert synthetic sequences to OMOP
+```bash
+# omop converter requires the OHDSI vocabulary
+export OMOP_VOCAB_DIR=""
+# the omop derived tables need to be built using pyspark
+export SPARK_WORKER_INSTANCES="1"
+export SPARK_WORKER_CORES="8"
+export SPARK_EXECUTOR_CORES="2"
+export SPARK_DRIVER_MEMORY="2g"
+export SPARK_EXECUTOR_MEMORY="2g"
+# Convert the sequences, create the omop derived tables
+sh scripts/omop_pipeline.sh \
+  test_results/top_p10000/generated_sequences/ \
+  test_results/top_p10000/restored_omop/ \
+  $OMOP_VOCAB_DIR
+```
 ## Citation
 ```
 @article{cehrgpt2024,
@@ -63,4 +110,3 @@ pip install .
   journal={arXiv preprint arXiv:2402.04400},
   year={2024}
 }
-```

{cehrgpt-0.0.1.dist-info → cehrgpt-0.0.2.dist-info}/RECORD RENAMED Viewed

@@ -13,19 +13,22 @@ cehrgpt/analysis/privacy/utils.py,sha256=CRA4H9mPLBjMQGKzZ_x_3ro3tMap-NjsMDVqSOj
 cehrgpt/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cehrgpt/data/hf_cehrgpt_dataset.py,sha256=7hvjjqE8WInVuRvAtNkFI_J-xluFBv1Ij4TPTdUxPM4,2570
 cehrgpt/data/hf_cehrgpt_dataset_collator.py,sha256=RYw5Isrwa4sdyQQ3Nf3cu7xPDA3m-c5ecCFf_y1TJKY,20497
-cehrgpt/data/hf_cehrgpt_dataset_mapping.py,sha256=aQ0gsThOFhrh9ExpJhRmuiwN9ShIKheLgCIci-N7HOM,4305
+cehrgpt/data/hf_cehrgpt_dataset_mapping.py,sha256=IjGwLKbEfNPxH3hsNmb8p48_imHnMWtslDK6f7R_1pc,16053
+cehrgpt/data/hf_cehrgpt_dpo_collator.py,sha256=cqDK0SUOt3yAqUHWKGuLVi3WmmUMZ6eyxTv9fC9idZA,2787
+cehrgpt/data/hf_cehrgpt_dpo_dataset_mapping.py,sha256=uCLF5VEsyZAG1aNwqEM6Jy5Lx7bI5ALku52Z6Anine0,2574
 cehrgpt/generation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cehrgpt/generation/chatgpt_generation.py,sha256=SrnLwHLdNtnAOEg36gNjqfoT9yd12iyPgpZffL2AFJo,4428
 cehrgpt/generation/generate_batch_hf_gpt_sequence.py,sha256=-WLpKlulVVDJSdA2jXyp87gfLW4Q3aAtwULK8fDtn_E,11408
-cehrgpt/generation/omop_converter_batch.py,sha256=SDpWjqzi8dsgVzbbFes42GMdZEvrJ3sm4RbP5UpmIlk,25280
+cehrgpt/generation/generate_paired_cehrgpt_sequence.py,sha256=fLu3SHhRe_ZQfS09ebOktq2dekStgYfxmbrRawZQAO4,8280
+cehrgpt/generation/omop_converter_batch.py,sha256=-c0AlDVy5pJ5Afhr8ERiCHhoRrEk8ozJi3g0yFdWaMI,25348
 cehrgpt/generation/omop_entity.py,sha256=Q5Sr0AlyuPAm1FRPfnJO13q-u1fqRgYVHXruZ9g4xNE,19400
 cehrgpt/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cehrgpt/models/config.py,sha256=xek4W_siO7WtMAKE7zDsENotsIE70F8dcW-PTC0kBKk,9700
-cehrgpt/models/hf_cehrgpt.py,sha256=YrHhT8c92xcOVTb6FjFQokyHrDOcXgEDMBs0BksSBpA,75739
+cehrgpt/models/hf_cehrgpt.py,sha256=CKseTvGkBFwXK40Z_uKD1_d84oSYCFqKmHI0qtdk72g,75757
 cehrgpt/models/hf_modeling_outputs.py,sha256=LaWa1jI6BRIKMEjWOy1QUeOfTur5y_p2c-JyuGVTdtw,10301
 cehrgpt/models/pretrained_embeddings.py,sha256=vLLVs17TLpXRqCVEWQxGGwPHkUJUO7laNTeBuyBK_yk,3238
 cehrgpt/models/special_tokens.py,sha256=-a7HPJBbdIH0qQ6B3CcRKqvpG6FZlm4nbVPTswGSJ4U,485
-cehrgpt/models/tokenization_hf_cehrgpt.py,sha256=jQR5aHjdHhS14nC1qnqDmybS1gpB27WK2-qVNz9cxW0,42156
+cehrgpt/models/tokenization_hf_cehrgpt.py,sha256=JAZjnmQq-JjUxZK7XIsqdZB07ZB7BC2WraCjpO_6AOM,42161
 cehrgpt/omop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cehrgpt/omop/condition_era.py,sha256=hPZALz2XaWnro_1bwIYNkI48foOJjueyg3CZ1BliCno,626
 cehrgpt/omop/observation_period.py,sha256=TRMgv5Ya2RaS2im7oQ6BLC_5JL9EJYNYR62ApxIuHvg,1211
@@ -35,11 +38,17 @@ cehrgpt/omop/sample_omop_tables.py,sha256=2JZ8BNSvssceinwFanvuCRh-YlKrKn25U9w1pL
 cehrgpt/omop/queries/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cehrgpt/omop/queries/condition_era.py,sha256=LFB6vBAvshHJxtYIRkl7cfrF0kf7ay0piBKpmHBwrpE,2578
 cehrgpt/omop/queries/observation_period.py,sha256=fpzr5DMNw-QLoSwp2Iatfch88E3hyhZ75usiIdG3A0U,6410
+cehrgpt/rl_finetune/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cehrgpt/rl_finetune/cehrgpt_dpo_trainer.py,sha256=VQHf5vy5i8K1imcqYakhitfAW-d2mnaEzkSoAYSW5kg,26062
+cehrgpt/rl_finetune/cehrgpt_ppo_trainer.py,sha256=nYWYPCaWNjDGEwlo6UHOK1rvOZUx1vuJ8kYuAszI8Zg,17925
+cehrgpt/rl_finetune/ppo_finetune.py,sha256=tSy-C0Kzgj5ffclBIDj-RTj78ZfrLmTESxVxd0n9yuE,13971
+cehrgpt/rl_finetune/ppo_finetune_v2.py,sha256=7dChwKpq4zKmpkcxP4hryqBoIkcwmTJ44_BF8R2RghQ,13285
 cehrgpt/runners/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cehrgpt/runners/gpt_runner_util.py,sha256=88HKSVj-ADGBCMo7C3znKSMPnAAALa1iU_6P6i9sD0M,3867
-cehrgpt/runners/hf_cehrgpt_finetune_runner.py,sha256=aGw87ZJuUIH196ryaZzt9D4hCAHVcDyKnvvdVPdipwc,31568
-cehrgpt/runners/hf_cehrgpt_pretrain_runner.py,sha256=6xulvnjwy6LDRPIL_zgsYH7sJMiXJ9AvFg3p2o35S6c,16510
-cehrgpt/runners/hf_gpt_runner_argument_dataclass.py,sha256=2l1X5bp1zckoFp0rQkxGptXyG8u3PgNw0dqYVDWLYjg,5155
+cehrgpt/runners/hf_cehrgpt_dpo_runner.py,sha256=Z4qNl9CZFC5YvUBc9ZzdOV5wsBFvMTdxfTn4jjtJQ-Y,4583
+cehrgpt/runners/hf_cehrgpt_finetune_runner.py,sha256=reflNRb6YB6f_3jAfzFAdwKtTl6hvdIp9Jc7DC-Sv-U,32580
+cehrgpt/runners/hf_cehrgpt_pretrain_runner.py,sha256=L3UpjtzxuS8a_tshlqpZN_sXnJSs3yzry0GZNT__05A,18200
+cehrgpt/runners/hf_gpt_runner_argument_dataclass.py,sha256=gKVf4BLzNCFiJR7nZVkf-QRcj8fAEVvIUTV-AVH0g_U,5312
 cehrgpt/runners/hyperparameter_search_util.py,sha256=i4qAb_22JO78l40MSyBPwDgAGuGc96efXmg_833cSSo,9044
 cehrgpt/time_to_event/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cehrgpt/time_to_event/time_to_event_model.py,sha256=tfXa24l_0q1TBZ68BPRrHRC_3KRWYxrWGIv4myJlIb8,8497
@@ -50,11 +59,12 @@ cehrgpt/time_to_event/config/next_visit_type_prediction.yaml,sha256=WMj2ZutEvHKI
 cehrgpt/time_to_event/config/t2dm_hf.yaml,sha256=_oMQzh2eJTYzEaMOpmhAzbX-qmdsKlkORELL6HxOxHo,202
 cehrgpt/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cehrgpt/tools/ehrshot_benchmark.py,sha256=E-m_5srlYEw7Y7i9twIJWDvrkwNlop-6yZB-80FZid0,2667
+cehrgpt/tools/generate_causal_patient_split_by_age.py,sha256=dmHiPAL_kR1WrhRteIiHH9dwMtMi3PVl8jXm2O06_gI,4177
 cehrgpt/tools/generate_pretrained_embeddings.py,sha256=lhFSacGv8bMld6qigKZN8Op8eXpFi0DsJuQbWKOWXqI,4160
 cehrgpt/tools/merge_synthetic_real_dataasets.py,sha256=O1dbQ32Le0t15fwymwAh9mfNVLEWuFwW53DNvESrWbY,7589
 cehrgpt/tools/upload_omop_tables.py,sha256=vdBAbkeAsGPA4NsyhNjelPVj3gS8yzmS1sKNM1Qk96g,3791
-cehrgpt-0.0.1.dist-info/LICENSE,sha256=LOfC32zkfUIdGm8e_098jPbt8OHKtNWymDzxn2pA9Zk,1093
-cehrgpt-0.0.1.dist-info/METADATA,sha256=BZrsoZe0Smn4JoA3cCI63fC4nBvOVrC9sgZ0Ct1NJsA,3388
-cehrgpt-0.0.1.dist-info/WHEEL,sha256=nn6H5-ilmfVryoAQl3ZQ2l8SH5imPWFpm1A5FgEuFV4,91
-cehrgpt-0.0.1.dist-info/top_level.txt,sha256=akNCJBbMSLV8nkOzdVzdy13hMJ5CIQURnAS_YYEDVwA,17
-cehrgpt-0.0.1.dist-info/RECORD,,
+cehrgpt-0.0.2.dist-info/LICENSE,sha256=LOfC32zkfUIdGm8e_098jPbt8OHKtNWymDzxn2pA9Zk,1093
+cehrgpt-0.0.2.dist-info/METADATA,sha256=joUmDJWMEBvYphrkwYiK273FwSL9okY74D93ncrbvMU,4878
+cehrgpt-0.0.2.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+cehrgpt-0.0.2.dist-info/top_level.txt,sha256=akNCJBbMSLV8nkOzdVzdy13hMJ5CIQURnAS_YYEDVwA,17
+cehrgpt-0.0.2.dist-info/RECORD,,

{cehrgpt-0.0.1.dist-info → cehrgpt-0.0.2.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.8.1)
+Generator: setuptools (76.0.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{cehrgpt-0.0.1.dist-info → cehrgpt-0.0.2.dist-info}/LICENSE RENAMED Viewed

File without changes

{cehrgpt-0.0.1.dist-info → cehrgpt-0.0.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

cehrgpt 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl

cehrgpt 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl