cehrgpt 0.0.2__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cehrgpt/analysis/irregularity.py +36 -0
- cehrgpt/data/hf_cehrgpt_dataset.py +25 -4
- cehrgpt/data/hf_cehrgpt_dataset_collator.py +635 -97
- cehrgpt/data/hf_cehrgpt_dataset_mapping.py +308 -95
- cehrgpt/data/sample_packing_sampler.py +181 -0
- cehrgpt/generation/generate_batch_hf_gpt_sequence.py +12 -9
- cehrgpt/generation/omop_converter_batch.py +32 -2
- cehrgpt/gpt_utils.py +20 -2
- cehrgpt/models/config.py +35 -0
- cehrgpt/models/hf_cehrgpt.py +470 -106
- cehrgpt/models/hf_modeling_outputs.py +1 -0
- cehrgpt/models/special_tokens.py +1 -0
- cehrgpt/models/tokenization_hf_cehrgpt.py +358 -71
- cehrgpt/runners/data_utils.py +358 -0
- cehrgpt/runners/gpt_runner_util.py +0 -10
- cehrgpt/runners/hf_cehrgpt_finetune_runner.py +181 -283
- cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +288 -112
- cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +90 -0
- cehrgpt/runners/hyperparameter_search_util.py +10 -8
- cehrgpt/runners/sample_packing_trainer.py +185 -0
- cehrgpt/simulations/generate_plots.py +95 -0
- cehrgpt/simulations/run_simulation.sh +24 -0
- cehrgpt/simulations/time_embedding_simulation.py +250 -0
- cehrgpt/simulations/time_token_simulation.py +177 -0
- cehrgpt/time_to_event/config/1_year_cabg.yaml +23 -0
- cehrgpt/time_to_event/time_to_event_model.py +2 -13
- cehrgpt/time_to_event/time_to_event_prediction.py +27 -13
- cehrgpt/tools/linear_prob/__init__.py +0 -0
- cehrgpt/tools/linear_prob/compute_cehrgpt_features.py +495 -0
- cehrgpt/tools/linear_prob/train_with_cehrgpt_features.py +152 -0
- {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/METADATA +11 -8
- {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/RECORD +36 -32
- {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/WHEEL +1 -1
- cehrgpt/data/hf_cehrgpt_dpo_collator.py +0 -71
- cehrgpt/data/hf_cehrgpt_dpo_dataset_mapping.py +0 -61
- cehrgpt/generation/generate_paired_cehrgpt_sequence.py +0 -224
- cehrgpt/rl_finetune/cehrgpt_dpo_trainer.py +0 -586
- cehrgpt/rl_finetune/cehrgpt_ppo_trainer.py +0 -464
- cehrgpt/rl_finetune/ppo_finetune.py +0 -394
- cehrgpt/rl_finetune/ppo_finetune_v2.py +0 -373
- cehrgpt/runners/hf_cehrgpt_dpo_runner.py +0 -119
- /cehrgpt/{rl_finetune → simulations}/__init__.py +0 -0
- {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info/licenses}/LICENSE +0 -0
- {cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/top_level.txt +0 -0
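Most of what follows is the diff of the wheel's RECORD file, which lists every installed file as `path,sha256=<digest>,<size>`; the digest is the urlsafe-base64 SHA-256 of the file contents with the trailing `=` padding stripped, per the wheel spec. Several removed-side entries were truncated by the diff viewer and are reproduced as-is. A minimal sketch of the hashing convention:

import base64
import hashlib

def record_hash(data: bytes) -> str:
    # Wheel RECORD convention: urlsafe base64 of the sha256 digest,
    # with the trailing "=" padding stripped.
    digest = hashlib.sha256(data).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# Every empty __init__.py shares the same entry: the hash of zero bytes.
assert record_hash(b"") == "47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU"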
{cehrgpt-0.0.2.dist-info → cehrgpt-0.1.1.dist-info}/RECORD
@@ -1,8 +1,9 @@
 __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cehrgpt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cehrgpt/cehrgpt_args.py,sha256=zPLp9Qjlq5PapWx3R15BNnyaX8zV3dxr4PuWj71r0Lg,3516
-cehrgpt/gpt_utils.py,sha256=
+cehrgpt/gpt_utils.py,sha256=IA5qw-hxcKkGO07AB47lDNRU6mlb9jblpKO7KeLLN78,11342
 cehrgpt/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cehrgpt/analysis/irregularity.py,sha256=Rfl_daMvSh9cZ68vUwfmuH-JYCFXdAph2ITHHffYC0Y,1047
 cehrgpt/analysis/privacy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cehrgpt/analysis/privacy/attribute_inference.py,sha256=0ANVW0I5uvOl6IxQ15-vMVQd0mugOgSGReBUQQESImg,9368
 cehrgpt/analysis/privacy/attribute_inference_config.yml,sha256=hfLfpBlDqqsNOynpRHK414vV24edKA6ta-inmEhM2ao,103272
@@ -11,24 +12,22 @@ cehrgpt/analysis/privacy/nearest_neighbor_inference.py,sha256=qoJgWW7VsUMzjMGpTa
 cehrgpt/analysis/privacy/reid_inference.py,sha256=Pypd3QJXQNY8VljpnIEa5zeAbTZHMjQOazaL-9VsBGw,13955
 cehrgpt/analysis/privacy/utils.py,sha256=CRA4H9mPLBjMQGKzZ_x_3ro3tMap-NjsMDVqSOjHSVQ,8226
 cehrgpt/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cehrgpt/data/hf_cehrgpt_dataset.py,sha256=
-cehrgpt/data/hf_cehrgpt_dataset_collator.py,sha256=
-cehrgpt/data/hf_cehrgpt_dataset_mapping.py,sha256=
-cehrgpt/data/
-cehrgpt/data/hf_cehrgpt_dpo_dataset_mapping.py,sha256=uCLF5VEsyZAG1aNwqEM6Jy5Lx7bI5ALku52Z6Anine0,2574
+cehrgpt/data/hf_cehrgpt_dataset.py,sha256=hwJlGW7XiJIr6cXtmwvReQf9yLZJPD-dvJGvRg5ERqU,3755
+cehrgpt/data/hf_cehrgpt_dataset_collator.py,sha256=ACMXiaYnR3bKD5dRleL0_siEvhL-2HAFcy5eBgvxnH4,44412
+cehrgpt/data/hf_cehrgpt_dataset_mapping.py,sha256=KU0WMjc2vT1zBAl7JJkOc8dgGxsL1uFDy4dDrv-RkII,25668
+cehrgpt/data/sample_packing_sampler.py,sha256=vovGMtmhG70DRkSCeiaDEJ_rjKZ38y-YLaI1kkhFEkI,6747
 cehrgpt/generation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cehrgpt/generation/chatgpt_generation.py,sha256=SrnLwHLdNtnAOEg36gNjqfoT9yd12iyPgpZffL2AFJo,4428
-cehrgpt/generation/generate_batch_hf_gpt_sequence.py,sha256
-cehrgpt/generation/
-cehrgpt/generation/omop_converter_batch.py,sha256=-c0AlDVy5pJ5Afhr8ERiCHhoRrEk8ozJi3g0yFdWaMI,25348
+cehrgpt/generation/generate_batch_hf_gpt_sequence.py,sha256=uSEh8aMmPD61nGewIaPSkIqm-2AxDjCBiu4cBfxHxU4,11503
+cehrgpt/generation/omop_converter_batch.py,sha256=LUmCD-t_6ZP1YfNDZCqYewl-XIIaIgRZ_dAxuR_VdCQ,26275
 cehrgpt/generation/omop_entity.py,sha256=Q5Sr0AlyuPAm1FRPfnJO13q-u1fqRgYVHXruZ9g4xNE,19400
 cehrgpt/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cehrgpt/models/config.py,sha256=
-cehrgpt/models/hf_cehrgpt.py,sha256=
-cehrgpt/models/hf_modeling_outputs.py,sha256=
+cehrgpt/models/config.py,sha256=nOAKgH5420HLCcy7n1hE7MbqR861Iq4DTutKoAd25tg,11090
+cehrgpt/models/hf_cehrgpt.py,sha256=77CAkdMPgxD4xSpFU7gYGzRn6_Iv-4q7FnHpnZGsKxw,92450
+cehrgpt/models/hf_modeling_outputs.py,sha256=5X4WEYKqT37phv_e5ZAv3A_N0wqdAUJLJRm6TxS6dDQ,10356
 cehrgpt/models/pretrained_embeddings.py,sha256=vLLVs17TLpXRqCVEWQxGGwPHkUJUO7laNTeBuyBK_yk,3238
-cehrgpt/models/special_tokens.py,sha256
-cehrgpt/models/tokenization_hf_cehrgpt.py,sha256=
+cehrgpt/models/special_tokens.py,sha256=lrw45B4tea4Dsajn09Cz6w5D2TfHmYXikZkgwnstu_o,521
+cehrgpt/models/tokenization_hf_cehrgpt.py,sha256=cAxHTctpVBxfWfC3XcwDQavN1zwWN9Nid_Fajd5zQWQ,53159
 cehrgpt/omop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cehrgpt/omop/condition_era.py,sha256=hPZALz2XaWnro_1bwIYNkI48foOJjueyg3CZ1BliCno,626
 cehrgpt/omop/observation_period.py,sha256=TRMgv5Ya2RaS2im7oQ6BLC_5JL9EJYNYR62ApxIuHvg,1211
@@ -38,22 +37,24 @@ cehrgpt/omop/sample_omop_tables.py,sha256=2JZ8BNSvssceinwFanvuCRh-YlKrKn25U9w1pL
 cehrgpt/omop/queries/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cehrgpt/omop/queries/condition_era.py,sha256=LFB6vBAvshHJxtYIRkl7cfrF0kf7ay0piBKpmHBwrpE,2578
 cehrgpt/omop/queries/observation_period.py,sha256=fpzr5DMNw-QLoSwp2Iatfch88E3hyhZ75usiIdG3A0U,6410
-cehrgpt/rl_finetune/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cehrgpt/rl_finetune/cehrgpt_dpo_trainer.py,sha256=VQHf5vy5i8K1imcqYakhitfAW-d2mnaEzkSoAYSW5kg,26062
-cehrgpt/rl_finetune/cehrgpt_ppo_trainer.py,sha256=nYWYPCaWNjDGEwlo6UHOK1rvOZUx1vuJ8kYuAszI8Zg,17925
-cehrgpt/rl_finetune/ppo_finetune.py,sha256=tSy-C0Kzgj5ffclBIDj-RTj78ZfrLmTESxVxd0n9yuE,13971
-cehrgpt/rl_finetune/ppo_finetune_v2.py,sha256=7dChwKpq4zKmpkcxP4hryqBoIkcwmTJ44_BF8R2RghQ,13285
 cehrgpt/runners/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cehrgpt/runners/
-cehrgpt/runners/
-cehrgpt/runners/hf_cehrgpt_finetune_runner.py,sha256=
-cehrgpt/runners/hf_cehrgpt_pretrain_runner.py,sha256=
-cehrgpt/runners/hf_gpt_runner_argument_dataclass.py,sha256=
-cehrgpt/runners/hyperparameter_search_util.py,sha256=
+cehrgpt/runners/data_utils.py,sha256=I6k1TkiiZR8ggw3eVO16g2lVPY-Hu3b-nbrIOKlFIO0,15528
+cehrgpt/runners/gpt_runner_util.py,sha256=YJQSRW9Mo4TjXSOUOTf6BUFcs1MGFiXU5T4ztKZcYhU,3485
+cehrgpt/runners/hf_cehrgpt_finetune_runner.py,sha256=GVbHHqf5TWGbVWlQG-XurgYH8pKRjTk8ug_ib9L9U7E,28118
+cehrgpt/runners/hf_cehrgpt_pretrain_runner.py,sha256=ERSnvB38fPYVghtKQeNTZ8VfeXnoRcCHB0cWISWaZ84,26523
+cehrgpt/runners/hf_gpt_runner_argument_dataclass.py,sha256=ejAFLM9g765p1fyeF5MITsiIeWHKkz9wTeFDeVgxSto,8851
+cehrgpt/runners/hyperparameter_search_util.py,sha256=YWdFQ1igQs-G_wqWUrUzYraGiz8OSpSYyvid-I5nhWA,9262
+cehrgpt/runners/sample_packing_trainer.py,sha256=Zb7Aqwnk8-VqrjEKUVeg5XzZWmHxXOU2sDn1YURS-FU,7960
+cehrgpt/simulations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cehrgpt/simulations/generate_plots.py,sha256=BTZ71r8Kah0PMorkiO3vw55_p_9U1Z8KiD3GsPfaV0s,2520
+cehrgpt/simulations/run_simulation.sh,sha256=DcJ6B19jIteUO0pZ0Tc21876lB9XxQHFAxlre7MtAzk,795
+cehrgpt/simulations/time_embedding_simulation.py,sha256=HZ-imXH-bN-QYZN1PAfcERmNtaWIwKjbf0UrZduwCiA,8687
+cehrgpt/simulations/time_token_simulation.py,sha256=sLg8vVXydvR_zk3BbqyrlA7sDIdhFnS-s5pSKcCilSc,6057
 cehrgpt/time_to_event/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cehrgpt/time_to_event/time_to_event_model.py,sha256=
-cehrgpt/time_to_event/time_to_event_prediction.py,sha256=
+cehrgpt/time_to_event/time_to_event_model.py,sha256=Plm0bZxvlAbnMl82DTBXWvaXLvrqcdkzcP_celX8WC4,8055
+cehrgpt/time_to_event/time_to_event_prediction.py,sha256=W2e7UqIV7ELdfTy997HS66vggjnhdncCKt840knI0Dw,13183
 cehrgpt/time_to_event/time_to_event_utils.py,sha256=KN4hwGgxy2nJtO7osbYQBF3-HpmGUWefNfexzPYiEwc,1937
+cehrgpt/time_to_event/config/1_year_cabg.yaml,sha256=SFF2-F5D02pDSMRddDrEUoERBCd0t2Hzln_xC-Mo2hA,407
 cehrgpt/time_to_event/config/30_day_readmission.yaml,sha256=Hn5KnEXMtSV_CtCpmAU4wjkc0-gTXvniaH991TSbUXA,234
 cehrgpt/time_to_event/config/next_visit_type_prediction.yaml,sha256=WMj2ZutEvHKIMyGG51xtXaL6MyRANKvpg9xT8ouctLc,319
 cehrgpt/time_to_event/config/t2dm_hf.yaml,sha256=_oMQzh2eJTYzEaMOpmhAzbX-qmdsKlkORELL6HxOxHo,202
@@ -63,8 +64,11 @@ cehrgpt/tools/generate_causal_patient_split_by_age.py,sha256=dmHiPAL_kR1WrhRteIi
 cehrgpt/tools/generate_pretrained_embeddings.py,sha256=lhFSacGv8bMld6qigKZN8Op8eXpFi0DsJuQbWKOWXqI,4160
 cehrgpt/tools/merge_synthetic_real_dataasets.py,sha256=O1dbQ32Le0t15fwymwAh9mfNVLEWuFwW53DNvESrWbY,7589
 cehrgpt/tools/upload_omop_tables.py,sha256=vdBAbkeAsGPA4NsyhNjelPVj3gS8yzmS1sKNM1Qk96g,3791
-cehrgpt
-cehrgpt
-cehrgpt
-cehrgpt-0.
-cehrgpt-0.
+cehrgpt/tools/linear_prob/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cehrgpt/tools/linear_prob/compute_cehrgpt_features.py,sha256=q0rmlBWDDEkjHjwcTouGUhCYa32a1vRicaDOAMsdW0I,20741
+cehrgpt/tools/linear_prob/train_with_cehrgpt_features.py,sha256=w0UvzMKYGenN_KDVnbzutmy8IPLUxW5hPvpKKxDSL5U,5820
+cehrgpt-0.1.1.dist-info/licenses/LICENSE,sha256=LOfC32zkfUIdGm8e_098jPbt8OHKtNWymDzxn2pA9Zk,1093
+cehrgpt-0.1.1.dist-info/METADATA,sha256=VnXH74vJQZaV7VxGiIvJnFhQA0jzJQNx86yHFkygobM,4922
+cehrgpt-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+cehrgpt-0.1.1.dist-info/top_level.txt,sha256=akNCJBbMSLV8nkOzdVzdy13hMJ5CIQURnAS_YYEDVwA,17
+cehrgpt-0.1.1.dist-info/RECORD,,
cehrgpt/data/hf_cehrgpt_dpo_collator.py
@@ -1,71 +0,0 @@
-import torch
-from torch.nn.utils.rnn import pad_sequence
-
-from cehrgpt.data.hf_cehrgpt_dataset_collator import CehrGptDataCollator
-
-
-class CehrGptDPODataCollator(CehrGptDataCollator):
-
-    def create_preference_inputs(self, examples, prefix):
-        batch = {}
-        # Assume that each example in the batch is a dictionary with 'input_ids' and 'attention_mask'
-        batch_input_ids = [
-            self._try_reverse_tensor(
-                self._convert_to_tensor(example[f"{prefix}_input_ids"])
-            )
-            for example in examples
-        ]
-        batch_attention_mask = [
-            self._try_reverse_tensor(
-                torch.ones_like(
-                    self._convert_to_tensor(example[f"{prefix}_input_ids"]),
-                    dtype=torch.float,
-                )
-            )
-            for example in examples
-        ]
-        # Pad sequences to the max length in the batch
-        batch[f"{prefix}_input_ids"] = self._try_reverse_tensor(
-            pad_sequence(
-                batch_input_ids,
-                batch_first=True,
-                padding_value=self.tokenizer.pad_token_id,
-            ).to(torch.int64)
-        )
-        batch[f"{prefix}_attention_mask"] = self._try_reverse_tensor(
-            pad_sequence(batch_attention_mask, batch_first=True, padding_value=0.0)
-        )
-        assert batch[f"{prefix}_input_ids"].shape[1] <= self.max_length
-        assert batch[f"{prefix}_attention_mask"].shape[1] <= self.max_length
-
-        if self.include_values:
-            batch_value_indicators = [
-                self._try_reverse_tensor(
-                    self._convert_to_tensor(example[f"{prefix}_value_indicators"])
-                )
-                for example in examples
-            ]
-            batch_values = [
-                self._try_reverse_tensor(
-                    self._convert_to_tensor(example[f"{prefix}__values"])
-                )
-                for example in examples
-            ]
-
-            batch[f"{prefix}_value_indicators"] = self._try_reverse_tensor(
-                pad_sequence(
-                    batch_value_indicators, batch_first=True, padding_value=False
-                )
-            )
-            batch[f"{prefix}_values"] = self._try_reverse_tensor(
-                pad_sequence(batch_values, batch_first=True, padding_value=-1.0)
-            )
-            assert batch[f"{prefix}_value_indicators"].shape[1] <= self.max_length
-            assert batch[f"{prefix}_values"].shape[1] <= self.max_length
-        return batch
-
-    def __call__(self, examples):
-        batch_chosen = self.create_preference_inputs(examples, "chosen")
-        batch_rejected = self.create_preference_inputs(examples, "rejected")
-        batch_chosen.update(batch_rejected)
-        return batch_chosen
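The deleted collator wraps every tensor in `_try_reverse_tensor` before and after `pad_sequence`. Assuming that helper flips a sequence end-for-end (an inference from the surrounding code, not documented behavior), the reverse-pad-reverse round trip is the standard way to get left-padding out of `pad_sequence`, which only right-pads; left-padding keeps the live tokens at the right edge of the context, which is what a decoder-only model needs at generation time. A self-contained sketch of the trick:

import torch
from torch.nn.utils.rnn import pad_sequence

def left_pad(sequences, padding_value):
    # pad_sequence can only right-pad, so flip each sequence,
    # right-pad the flipped copies, then flip the padded rows back.
    flipped = [seq.flip(0) for seq in sequences]
    padded = pad_sequence(flipped, batch_first=True, padding_value=padding_value)
    return padded.flip(1)

print(left_pad([torch.tensor([1, 2, 3]), torch.tensor([4])], padding_value=0))
# tensor([[1, 2, 3],
#         [0, 0, 4]])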
cehrgpt/data/hf_cehrgpt_dpo_dataset_mapping.py
@@ -1,61 +0,0 @@
-import copy
-from typing import Any, Dict
-
-import numpy as np
-from cehrbert.data_generators.hf_data_generator.hf_dataset_mapping import DatasetMapping
-
-from cehrgpt.models.tokenization_hf_cehrgpt import CehrGptTokenizer
-
-
-class HFCehrGptDPOTokenizationMapping(DatasetMapping):
-    def __init__(
-        self,
-        concept_tokenizer: CehrGptTokenizer,
-    ):
-        self._concept_tokenizer = concept_tokenizer
-        self._lab_token_ids = self._concept_tokenizer.lab_token_ids
-
-    def transform_with_prefix(self, record: Dict[str, Any], prefix) -> Dict[str, Any]:
-        concept_ids = record[f"{prefix}_concept_ids"]
-        input_ids = self._concept_tokenizer.encode(concept_ids)
-        record[f"{prefix}_input_ids"] = input_ids
-
-        if f"{prefix}_concept_value_masks" in record:
-            concept_value_masks = record[f"{prefix}_concept_value_masks"]
-            concept_values = record[f"{prefix}_concept_values"]
-            # If any concept has a value associated with it, we normalize the value
-            if np.any(np.asarray(concept_value_masks) > 0):
-                units = record[f"{prefix}_units"]
-                normalized_concept_values = copy.deepcopy(concept_values)
-                for i, (
-                    concept_id,
-                    unit,
-                    token_id,
-                    concept_value_mask,
-                    concept_value,
-                ) in enumerate(
-                    zip(
-                        concept_ids,
-                        units,
-                        input_ids,
-                        concept_value_masks,
-                        concept_values,
-                    )
-                ):
-                    if token_id in self._lab_token_ids:
-                        normalized_concept_value = self._concept_tokenizer.normalize(
-                            concept_id, unit, concept_value
-                        )
-                        normalized_concept_values[i] = normalized_concept_value
-                record[f"{prefix}_concept_values"] = normalized_concept_values
-            # Overwrite the column names
-            record[f"{prefix}_value_indicators"] = record[
-                f"{prefix}_concept_value_masks"
-            ]
-            record[f"{prefix}_values"] = record[f"{prefix}_concept_values"]
-        return record
-
-    def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
-        record = self.transform_with_prefix(record, prefix="chosen")
-        record.update(self.transform_with_prefix(record, prefix="rejected"))
-        return record
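The deleted mapping ran the same transformation twice, once per preference side, producing `chosen_input_ids` and `rejected_input_ids` columns next to the original concept-id columns. A stripped-down restatement of that core step (the toy vocabulary and `encode` stand-in are illustrative, and the lab-value normalization branch is omitted):

from typing import Any, Callable, Dict, List

def tokenize_prefix(
    record: Dict[str, Any], prefix: str, encode: Callable[[List[str]], List[int]]
) -> Dict[str, Any]:
    # Mirrors the removed mapping: encode "{prefix}_concept_ids"
    # into "{prefix}_input_ids".
    record[f"{prefix}_input_ids"] = encode(record[f"{prefix}_concept_ids"])
    return record

vocab = {"VS": 0, "320128": 1, "9201": 2, "VE": 3}  # toy vocabulary
record = {
    "chosen_concept_ids": ["VS", "320128", "VE"],
    "rejected_concept_ids": ["VS", "9201", "VE"],
}
for prefix in ("chosen", "rejected"):
    record = tokenize_prefix(record, prefix, lambda ids: [vocab[c] for c in ids])
print(record["chosen_input_ids"], record["rejected_input_ids"])  # [0, 1, 3] [0, 2, 3]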
cehrgpt/generation/generate_paired_cehrgpt_sequence.py
@@ -1,224 +0,0 @@
-import datetime
-import os
-import random
-import uuid
-
-import pandas as pd
-import torch
-from cehrbert.runners.runner_util import load_parquet_as_dataset
-from transformers.utils import is_flash_attn_2_available, logging
-
-from cehrgpt.cehrgpt_args import create_inference_base_arg_parser
-from cehrgpt.generation.generate_batch_hf_gpt_sequence import (
-    generate_single_batch,
-    normalize_value,
-)
-from cehrgpt.gpt_utils import get_cehrgpt_output_folder
-from cehrgpt.models.hf_cehrgpt import CEHRGPT2LMHeadModel
-from cehrgpt.models.tokenization_hf_cehrgpt import CehrGptTokenizer
-
-LOG = logging.get_logger("transformers")
-
-
-def main(args):
-    if torch.cuda.is_available():
-        device = torch.device("cuda")
-    else:
-        device = torch.device("cpu")
-
-    cehrgpt_tokenizer = CehrGptTokenizer.from_pretrained(args.tokenizer_folder)
-    cehrgpt_model = (
-        CEHRGPT2LMHeadModel.from_pretrained(
-            args.model_folder,
-            attn_implementation=(
-                "flash_attention_2" if is_flash_attn_2_available() else "eager"
-            ),
-            torch_dtype=(
-                torch.bfloat16 if is_flash_attn_2_available() else torch.float32
-            ),
-        )
-        .eval()
-        .to(device)
-    )
-    cehrgpt_model.generation_config.pad_token_id = cehrgpt_tokenizer.pad_token_id
-    cehrgpt_model.generation_config.eos_token_id = cehrgpt_tokenizer.end_token_id
-    cehrgpt_model.generation_config.bos_token_id = cehrgpt_tokenizer.end_token_id
-
-    folder_name = get_cehrgpt_output_folder(args, cehrgpt_tokenizer)
-    output_folder_name = os.path.join(
-        args.output_folder, folder_name, "generated_sequences"
-    )
-
-    if not os.path.exists(output_folder_name):
-        os.makedirs(output_folder_name)
-
-    LOG.info(f"Loading tokenizer at {args.model_folder}")
-    LOG.info(f"Loading model at {args.model_folder}")
-    LOG.info(f"Write sequences to {output_folder_name}")
-    LOG.info(f"Context window {args.context_window}")
-    LOG.info(f"Temperature {args.temperature}")
-    LOG.info(f"Repetition Penalty {args.repetition_penalty}")
-    LOG.info(f"Sampling Strategy {args.sampling_strategy}")
-    LOG.info(f"Num beam {args.num_beams}")
-    LOG.info(f"Num beam groups {args.num_beam_groups}")
-    LOG.info(f"Epsilon cutoff {args.epsilon_cutoff}")
-    LOG.info(f"Top P {args.top_p}")
-    LOG.info(f"Top K {args.top_k}")
-    LOG.info(f"Loading sequence_data_path at {args.sequence_data_path}")
-
-    dataset = load_parquet_as_dataset(args.sequence_data_path)
-    total_rows = len(dataset)
-    float(args.batch_size) / total_rows
-    num_of_batches = args.num_of_patients // args.batch_size + 1
-    sequence_to_flush = []
-    for i in range(num_of_batches):
-        LOG.info(f"{datetime.datetime.now()}: Batch {i} started")
-        sample_data = []
-        while len(sample_data) == 0:
-            random_indices = random.sample(range(total_rows), k=1)
-            for row in dataset.select(random_indices):
-                if 4 <= len(row["concept_ids"]) <= cehrgpt_model.config.n_positions:
-                    sample_data.append(row)
-        prompts = []
-        chosen_responses = []
-        cutoff_frac = random.uniform(0, args.cutoff_frac_max)
-        for row in sample_data:
-            seq_len = len(row["concept_ids"])
-            prompt_len = max(4, int(seq_len * cutoff_frac))
-            prompts.append(cehrgpt_tokenizer.encode(row["concept_ids"][:prompt_len]))
-            chosen_responses.append(
-                {
-                    "person_id": row["person_id"],
-                    "chosen_concept_ids": (
-                        row["concept_ids"] if "concept_ids" in row else None
-                    ),
-                    "chosen_concept_values": (
-                        row["concept_values"] if "concept_values" in row else None
-                    ),
-                    "chosen_concept_value_masks": (
-                        row["concept_value_masks"]
-                        if "concept_value_masks" in row
-                        else None
-                    ),
-                    "chosen_units": row["units"] if "units" in row else None,
-                    "prompt_length": prompt_len,
-                }
-            )
-
-        batch_sequences = generate_single_batch(
-            cehrgpt_model,
-            cehrgpt_tokenizer,
-            prompts=prompts,
-            max_new_tokens=args.context_window,
-            mini_num_of_concepts=args.min_num_of_concepts,
-            top_p=args.top_p,
-            top_k=args.top_k,
-            temperature=args.temperature,
-            repetition_penalty=args.repetition_penalty,
-            num_beams=args.num_beams,
-            num_beam_groups=args.num_beam_groups,
-            epsilon_cutoff=args.epsilon_cutoff,
-            device=device,
-        )
-
-        # Clear the cache
-        torch.cuda.empty_cache()
-
-        for seq, value_indicator, value, chosen_response in zip(
-            batch_sequences["sequences"],
-            batch_sequences["value_indicators"],
-            batch_sequences["values"],
-            chosen_responses,
-        ):
-            output = {"rejected_concept_ids": seq}
-            normalized_values, units = normalize_value(
-                seq, value_indicator, value, cehrgpt_tokenizer
-            )
-            if normalized_values is not None:
-                output["rejected_concept_values"] = normalized_values
-            if value_indicator is not None:
-                output["rejected_concept_value_masks"] = value_indicator
-            if units is not None:
-                output["rejected_units"] = units
-            output.update(chosen_response)
-            sequence_to_flush.append(output)
-
-        if len(sequence_to_flush) >= args.buffer_size:
-            LOG.info(f"{datetime.datetime.now()}: Flushing to the Disk at Batch {i}")
-            pd.DataFrame(
-                sequence_to_flush,
-                columns=[
-                    "person_id",
-                    "chosen_concept_ids",
-                    "chosen_concept_values",
-                    "chosen_concept_value_masks",
-                    "chosen_units",
-                    "prompt_length",
-                    "rejected_concept_ids",
-                    "rejected_concept_values",
-                    "rejected_concept_value_masks",
-                    "rejected_units",
-                ],
-            ).to_parquet(os.path.join(output_folder_name, f"{uuid.uuid4()}.parquet"))
-            sequence_to_flush.clear()
-
-    if len(sequence_to_flush) > 0:
-        LOG.info(f"{datetime.datetime.now()}: Flushing to the Disk at Final Batch")
-        pd.DataFrame(
-            sequence_to_flush,
-            columns=[
-                "person_id",
-                "chosen_concept_ids",
-                "chosen_concept_values",
-                "chosen_concept_value_masks",
-                "chosen_units",
-                "prompt_length",
-                "rejected_concept_ids",
-                "rejected_concept_values",
-                "rejected_concept_value_masks",
-                "rejected_units",
-            ],
-        ).to_parquet(os.path.join(output_folder_name, f"{uuid.uuid4()}-last.parquet"))
-
-
-def create_arg_parser():
-    base_arg_parser = create_inference_base_arg_parser(
-        description="Arguments for generating paired patient sequences"
-    )
-    base_arg_parser.add_argument(
-        "--num_of_patients",
-        dest="num_of_patients",
-        action="store",
-        type=int,
-        help="The number of patients that will be generated",
-        required=True,
-    )
-    base_arg_parser.add_argument(
-        "--sequence_data_path",
-        dest="sequence_data_path",
-        action="store",
-        help="The path for your sequence data",
-        required=True,
-    )
-    base_arg_parser.add_argument(
-        "--cutoff_frac_max",
-        dest="cutoff_frac_max",
-        action="store",
-        type=float,
-        help="The max fraction of the patient sequences that will be used for prompting",
-        required=False,
-        default=0.5,
-    )
-    base_arg_parser.add_argument(
-        "--num_proc",
-        dest="num_proc",
-        action="store",
-        type=int,
-        required=False,
-        default=1,
-    )
-    return base_arg_parser
-
-
-if __name__ == "__main__":
-    main(create_arg_parser().parse_args())
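This removed script built each DPO training pair from a real patient: it truncated the recorded sequence to a randomly sized prompt, let the model complete it, then stored the patient's full real sequence under the `chosen_*` columns and the model's generation under `rejected_*`, together with `prompt_length`. The prompt construction distills to a few lines (a standalone restatement, not an import from the package):

import random

def build_prompt(concept_ids, cutoff_frac_max=0.5, rng=random):
    # Keep a random fraction of the patient's history as the prompt,
    # but never fewer than 4 tokens, mirroring the removed script.
    cutoff_frac = rng.uniform(0, cutoff_frac_max)
    prompt_len = max(4, int(len(concept_ids) * cutoff_frac))
    return concept_ids[:prompt_len], prompt_len

Note that in the original, `cutoff_frac` was drawn once per batch rather than per patient, so every sequence in a batch kept the same fraction of its history.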