cehrgpt 0.0.1__tar.gz → 0.1.0__tar.gz

This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
Files changed (111)
  1. cehrgpt-0.1.0/.gitignore +28 -0
  2. {cehrgpt-0.0.1/src/cehrgpt.egg-info → cehrgpt-0.1.0}/PKG-INFO +57 -9
  3. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/README.md +49 -4
  4. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/pyproject.toml +6 -4
  5. cehrgpt-0.1.0/sample_configs/cehrgpt_pretrain_sample_config.yaml +51 -0
  6. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/scripts/level_three_evaluation.sh +10 -6
  7. cehrgpt-0.1.0/scripts/omop_pipeline.sh +55 -0
  8. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/data/hf_cehrgpt_dataset.py +24 -4
  9. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/data/hf_cehrgpt_dataset_collator.py +260 -84
  10. cehrgpt-0.1.0/src/cehrgpt/data/hf_cehrgpt_dataset_mapping.py +393 -0
  11. cehrgpt-0.1.0/src/cehrgpt/data/sample_packing_sampler.py +151 -0
  12. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/generation/generate_batch_hf_gpt_sequence.py +12 -9
  13. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/generation/omop_converter_batch.py +3 -0
  14. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/models/config.py +10 -0
  15. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/models/hf_cehrgpt.py +244 -73
  16. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/models/tokenization_hf_cehrgpt.py +6 -2
  17. cehrgpt-0.1.0/src/cehrgpt/runners/data_utils.py +243 -0
  18. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/runners/gpt_runner_util.py +0 -10
  19. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/runners/hf_cehrgpt_finetune_runner.py +154 -260
  20. cehrgpt-0.1.0/src/cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +530 -0
  21. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +46 -0
  22. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/runners/hyperparameter_search_util.py +4 -1
  23. cehrgpt-0.1.0/src/cehrgpt/runners/sample_packing_trainer.py +168 -0
  24. cehrgpt-0.1.0/src/cehrgpt/simulations/generate_plots.py +95 -0
  25. cehrgpt-0.1.0/src/cehrgpt/simulations/run_simulation.sh +24 -0
  26. cehrgpt-0.1.0/src/cehrgpt/simulations/time_embedding_simulation.py +250 -0
  27. cehrgpt-0.1.0/src/cehrgpt/simulations/time_token_simulation.py +177 -0
  28. cehrgpt-0.1.0/src/cehrgpt/tools/generate_causal_patient_split_by_age.py +146 -0
  29. cehrgpt-0.1.0/src/cehrgpt/tools/linear_prob/compute_cehrgpt_features.py +467 -0
  30. cehrgpt-0.1.0/src/cehrgpt/tools/linear_prob/train_with_cehrgpt_features.py +152 -0
  31. {cehrgpt-0.0.1 → cehrgpt-0.1.0/src/cehrgpt.egg-info}/PKG-INFO +57 -9
  32. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt.egg-info/SOURCES.txt +15 -0
  33. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt.egg-info/requires.txt +6 -4
  34. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/tests/integration_tests/runners/hf_cehrgpt_pretrain_runner_test.py +15 -5
  35. cehrgpt-0.1.0/tests/integration_tests/runners/hf_cehrgpt_pretrain_sample_packing_runner_test.py +115 -0
  36. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/tests/integration_tests/runners/hf_cehrgpt_pretrain_sfm_runner_test.py +9 -3
  37. cehrgpt-0.1.0/tests/unit_tests/models/model_utils_test.py +131 -0
  38. cehrgpt-0.1.0/tests/unit_tests/runners/__init__.py +0 -0
  39. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/tests/unit_tests/runners/hf_cehrgpt_finetune_runner_test.py +4 -4
  40. cehrgpt-0.1.0/tests/unit_tests/tools/__init__.py +0 -0
  41. cehrgpt-0.0.1/.gitignore +0 -38
  42. cehrgpt-0.0.1/scripts/omop_pipeline.sh +0 -73
  43. cehrgpt-0.0.1/src/cehrgpt/data/hf_cehrgpt_dataset_mapping.py +0 -116
  44. cehrgpt-0.0.1/src/cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +0 -370
  45. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/.github/workflows/build-python.yaml +0 -0
  46. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/.github/workflows/tests.yaml +0 -0
  47. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/.pre-commit-config.yaml +0 -0
  48. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/LICENSE +0 -0
  49. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/sample_data/pretrain/patient_sequence.parquet +0 -0
  50. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/sample_data/pretrained_embeddings/pretrained_embedding_concepts.pkl +0 -0
  51. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/sample_data/pretrained_embeddings/pretrained_embedding_vectors.npy +0 -0
  52. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/scripts/pool_generated_sequences.sh +0 -0
  53. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/setup.cfg +0 -0
  54. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/__init__.py +0 -0
  55. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/__init__.py +0 -0
  56. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/analysis/__init__.py +0 -0
  57. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/analysis/privacy/__init__.py +0 -0
  58. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/analysis/privacy/attribute_inference.py +0 -0
  59. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/analysis/privacy/attribute_inference_config.yml +0 -0
  60. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/analysis/privacy/member_inference.py +0 -0
  61. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/analysis/privacy/nearest_neighbor_inference.py +0 -0
  62. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/analysis/privacy/reid_inference.py +0 -0
  63. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/analysis/privacy/utils.py +0 -0
  64. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/cehrgpt_args.py +0 -0
  65. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/data/__init__.py +0 -0
  66. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/generation/__init__.py +0 -0
  67. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/generation/chatgpt_generation.py +0 -0
  68. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/generation/omop_entity.py +0 -0
  69. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/gpt_utils.py +0 -0
  70. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/models/__init__.py +0 -0
  71. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/models/hf_modeling_outputs.py +0 -0
  72. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/models/pretrained_embeddings.py +0 -0
  73. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/models/special_tokens.py +0 -0
  74. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/omop/__init__.py +0 -0
  75. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/omop/condition_era.py +0 -0
  76. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/omop/observation_period.py +0 -0
  77. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/omop/omop_argparse.py +0 -0
  78. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/omop/omop_table_builder.py +0 -0
  79. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/omop/queries/__init__.py +0 -0
  80. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/omop/queries/condition_era.py +0 -0
  81. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/omop/queries/observation_period.py +0 -0
  82. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/omop/sample_omop_tables.py +0 -0
  83. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/runners/__init__.py +0 -0
  84. {cehrgpt-0.0.1/src/cehrgpt/time_to_event → cehrgpt-0.1.0/src/cehrgpt/simulations}/__init__.py +0 -0
  85. {cehrgpt-0.0.1/src/cehrgpt/tools → cehrgpt-0.1.0/src/cehrgpt/time_to_event}/__init__.py +0 -0
  86. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/time_to_event/config/30_day_readmission.yaml +0 -0
  87. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/time_to_event/config/next_visit_type_prediction.yaml +0 -0
  88. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/time_to_event/config/t2dm_hf.yaml +0 -0
  89. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/time_to_event/time_to_event_model.py +0 -0
  90. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/time_to_event/time_to_event_prediction.py +0 -0
  91. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/time_to_event/time_to_event_utils.py +0 -0
  92. {cehrgpt-0.0.1/tests → cehrgpt-0.1.0/src/cehrgpt/tools}/__init__.py +0 -0
  93. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/tools/ehrshot_benchmark.py +0 -0
  94. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/tools/generate_pretrained_embeddings.py +0 -0
  95. {cehrgpt-0.0.1/tests/integration_tests → cehrgpt-0.1.0/src/cehrgpt/tools/linear_prob}/__init__.py +0 -0
  96. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/tools/merge_synthetic_real_dataasets.py +0 -0
  97. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/tools/upload_omop_tables.py +0 -0
  98. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt.egg-info/dependency_links.txt +0 -0
  99. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt.egg-info/top_level.txt +0 -0
  100. {cehrgpt-0.0.1/tests/integration_tests/runners → cehrgpt-0.1.0/tests}/__init__.py +0 -0
  101. {cehrgpt-0.0.1/tests/unit_tests → cehrgpt-0.1.0/tests/integration_tests}/__init__.py +0 -0
  102. {cehrgpt-0.0.1/tests/unit_tests/models → cehrgpt-0.1.0/tests/integration_tests/runners}/__init__.py +0 -0
  103. {cehrgpt-0.0.1/tests/unit_tests/models/tokenization → cehrgpt-0.1.0/tests/unit_tests}/__init__.py +0 -0
  104. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/tests/unit_tests/gpt_utils_test.py +0 -0
  105. {cehrgpt-0.0.1/tests/unit_tests/runners → cehrgpt-0.1.0/tests/unit_tests/models}/__init__.py +0 -0
  106. {cehrgpt-0.0.1/tests/unit_tests/tools → cehrgpt-0.1.0/tests/unit_tests/models/tokenization}/__init__.py +0 -0
  107. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/tests/unit_tests/models/tokenization/create_bins_with_spline_test.py +0 -0
  108. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/tests/unit_tests/models/tokenization/create_sample_from_bins_test.py +0 -0
  109. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/tests/unit_tests/numeric_concept_statistics_test.py +0 -0
  110. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/tests/unit_tests/tokenization_test.py +0 -0
  111. {cehrgpt-0.0.1 → cehrgpt-0.1.0}/tests/unit_tests/tools/upload_omop_tables_test.py +0 -0
cehrgpt-0.1.0/.gitignore
@@ -0,0 +1,28 @@
+ .DS_Store
+ .idea/
+ .vscode/
+ venv*
+ dist/*
+
+ *png
+ *json
+
+ *ipynb_checkpoints/
+ *h5
+ *logs
+ *nohup.out
+ *ipynb
+
+ *__pycache__/
+ .eggs/
+ *.dat
+ .metastore_db/
+
+ build/
+
+ *.out
+ *.egg-info/
+
+ test_data
+ test_dataset_prepared
+ test*results
{cehrgpt-0.0.1/src/cehrgpt.egg-info → cehrgpt-0.1.0}/PKG-INFO
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: cehrgpt
- Version: 0.0.1
+ Version: 0.1.0
  Summary: CEHR-GPT: Generating Electronic Health Records with Chronological Patient Timelines
  Author-email: Chao Pang <chaopang229@gmail.com>, Xinzhuo Jiang <xj2193@cumc.columbia.edu>, Krishna Kalluri <kk3326@cumc.columbia.edu>, Elise Minto <em3697@cumc.columbia.edu>, Jason Patterson <jp3477@cumc.columbia.edu>, Nishanth Parameshwar Pavinkurve <np2689@cumc.columbia.edu>, Karthik Natarajan <kn2174@cumc.columbia.edu>
  License: MIT License
@@ -12,12 +12,14 @@ Classifier: Programming Language :: Python :: 3
  Requires-Python: >=3.10.0
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: cehrbert==1.3.3
+ Requires-Dist: cehrbert==1.4.1
+ Requires-Dist: cehrbert_data==0.0.7
  Requires-Dist: openai==1.54.3
  Requires-Dist: optuna==4.0.0
- Requires-Dist: transformers==4.40.0
- Requires-Dist: tokenizers==0.19
- Requires-Dist: trl==0.11.4
+ Requires-Dist: transformers==4.44.0
+ Requires-Dist: tokenizers==0.19.0
+ Requires-Dist: peft==0.10.0
+ Requires-Dist: lightgbm
  Provides-Extra: dev
  Requires-Dist: pre-commit; extra == "dev"
  Requires-Dist: pytest; extra == "dev"
@@ -28,6 +30,7 @@ Requires-Dist: hypothesis; extra == "dev"
  Requires-Dist: black; extra == "dev"
  Provides-Extra: flash-attn
  Requires-Dist: flash_attn; extra == "flash-attn"
+ Dynamic: license-file

  # CEHRGPT

@@ -50,11 +53,57 @@ CEHRGPT is a synthetic data generation model developed to handle structured elec
  To install CEHRGPT, clone this repository and install the required dependencies.

  ```bash
- git clone https://github.com/knatarajan-lab/cehrgpt-public.git
- cd cehrgpt-public
+ git clone https://github.com/knatarajan-lab/cehrgpt.git
+ cd cehrgpt
  pip install .
  ```

+ ## Pretrain
+ Pretrain cehrgpt using the Hugging Face trainer, the parameters can be found in the sample configuration yaml
+ ```bash
+ mkdir test_results
+ # This is NOT required when streaming is set to true
+ mkdir test_dataset_prepared
+ python -u -m cehrgpt.runners.hf_cehrgpt_pretrain_runner sample_configs/cehrgpt_pretrain_sample_config.yaml
+ ```
+
+ ## Generate synthetic sequences
+ Generate synthetic sequences using the trained model
+ ```bash
+ export TRANSFORMERS_VERBOSITY=info
+ export CUDA_VISIBLE_DEVICES="0"
+ python -u -m cehrgpt.generation.generate_batch_hf_gpt_sequence \
+ --model_folder test_results \
+ --tokenizer_folder test_results \
+ --output_folder test_results \
+ --num_of_patients 128 \
+ --batch_size 32 \
+ --buffer_size 128 \
+ --context_window 1024 \
+ --sampling_strategy TopPStrategy \
+ --top_p 1.0 --temperature 1.0 --repetition_penalty 1.0 \
+ --epsilon_cutoff 0.00 \
+ --demographic_data_path sample_data/pretrain
+ ```
+
+ ## Convert synthetic sequences to OMOP
+ ```bash
+ # omop converter requires the OHDSI vocabulary
+ export OMOP_VOCAB_DIR = ""
+ # the omop derived tables need to be built using pyspark
+ export SPARK_WORKER_INSTANCES="1"
+ export SPARK_WORKER_CORES="8"
+ export SPARK_EXECUTOR_CORES="2"
+ export SPARK_DRIVER_MEMORY="2g"
+ export SPARK_EXECUTOR_MEMORY="2g"
+
+ # Convert the sequences, create the omop derived tables
+ sh scripts/omop_pipeline.sh \
+ test_results/top_p10000/generated_sequences/ \
+ test_results/top_p10000/restored_omop/ \
+ $OMOP_VOCAB_DIR
+ ```
+
  ## Citation
  ```
  @article{cehrgpt2024,
@@ -63,4 +112,3 @@ pip install .
  journal={arXiv preprint arXiv:2402.04400},
  year={2024}
  }
- ```
{cehrgpt-0.0.1 → cehrgpt-0.1.0}/README.md
@@ -19,11 +19,57 @@ CEHRGPT is a synthetic data generation model developed to handle structured elec
  To install CEHRGPT, clone this repository and install the required dependencies.

  ```bash
- git clone https://github.com/knatarajan-lab/cehrgpt-public.git
- cd cehrgpt-public
+ git clone https://github.com/knatarajan-lab/cehrgpt.git
+ cd cehrgpt
  pip install .
  ```

+ ## Pretrain
+ Pretrain cehrgpt using the Hugging Face trainer, the parameters can be found in the sample configuration yaml
+ ```bash
+ mkdir test_results
+ # This is NOT required when streaming is set to true
+ mkdir test_dataset_prepared
+ python -u -m cehrgpt.runners.hf_cehrgpt_pretrain_runner sample_configs/cehrgpt_pretrain_sample_config.yaml
+ ```
+
+ ## Generate synthetic sequences
+ Generate synthetic sequences using the trained model
+ ```bash
+ export TRANSFORMERS_VERBOSITY=info
+ export CUDA_VISIBLE_DEVICES="0"
+ python -u -m cehrgpt.generation.generate_batch_hf_gpt_sequence \
+ --model_folder test_results \
+ --tokenizer_folder test_results \
+ --output_folder test_results \
+ --num_of_patients 128 \
+ --batch_size 32 \
+ --buffer_size 128 \
+ --context_window 1024 \
+ --sampling_strategy TopPStrategy \
+ --top_p 1.0 --temperature 1.0 --repetition_penalty 1.0 \
+ --epsilon_cutoff 0.00 \
+ --demographic_data_path sample_data/pretrain
+ ```
+
+ ## Convert synthetic sequences to OMOP
+ ```bash
+ # omop converter requires the OHDSI vocabulary
+ export OMOP_VOCAB_DIR = ""
+ # the omop derived tables need to be built using pyspark
+ export SPARK_WORKER_INSTANCES="1"
+ export SPARK_WORKER_CORES="8"
+ export SPARK_EXECUTOR_CORES="2"
+ export SPARK_DRIVER_MEMORY="2g"
+ export SPARK_EXECUTOR_MEMORY="2g"
+
+ # Convert the sequences, create the omop derived tables
+ sh scripts/omop_pipeline.sh \
+ test_results/top_p10000/generated_sequences/ \
+ test_results/top_p10000/restored_omop/ \
+ $OMOP_VOCAB_DIR
+ ```
+
  ## Citation
  ```
  @article{cehrgpt2024,
@@ -31,5 +77,4 @@ pip install .
  author={Natarajan, K and others},
  journal={arXiv preprint arXiv:2402.04400},
  year={2024}
- }
- ```
+ }
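The workflow added to the README above is driven entirely through CLI entry points (`hf_cehrgpt_pretrain_runner`, `generate_batch_hf_gpt_sequence`, `scripts/omop_pipeline.sh`). For ad-hoc inspection of the pretraining output, the checkpoint written to `test_results` can also be loaded directly in Python. The sketch below is assumption-laden: the model class name `CEHRGPT2LMHeadModel` in `cehrgpt.models.hf_cehrgpt` and the `from_pretrained` interface on both classes are not confirmed by this diff (only `CehrGptTokenizer` appears in it), so treat it as a starting point rather than the package's documented API.

```python
# Hypothetical sketch: load the pretraining artifacts written to test_results.
# Assumes CEHRGPT2LMHeadModel is the model class in cehrgpt.models.hf_cehrgpt and
# that both classes follow the Hugging Face from_pretrained() convention.
from cehrgpt.models.hf_cehrgpt import CEHRGPT2LMHeadModel  # assumed class name
from cehrgpt.models.tokenization_hf_cehrgpt import CehrGptTokenizer

checkpoint = "test_results"  # matches --model_folder / --tokenizer_folder in the README

tokenizer = CehrGptTokenizer.from_pretrained(checkpoint)
model = CEHRGPT2LMHeadModel.from_pretrained(checkpoint).eval()

# Basic sanity checks; actual sequence generation is easier through the
# cehrgpt.generation.generate_batch_hf_gpt_sequence CLI shown above.
print(model.config)
print("parameter count:", sum(p.numel() for p in model.parameters()))
```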
{cehrgpt-0.0.1 → cehrgpt-0.1.0}/pyproject.toml
@@ -28,12 +28,14 @@ classifiers = [
  ]

  dependencies = [
- "cehrbert==1.3.3",
+ "cehrbert==1.4.1",
+ "cehrbert_data==0.0.7",
  "openai==1.54.3",
  "optuna==4.0.0",
- "transformers==4.40.0",
- "tokenizers==0.19",
- "trl==0.11.4",
+ "transformers==4.44.0",
+ "tokenizers==0.19.0",
+ "peft==0.10.0",
+ "lightgbm",
  ]

  [tool.setuptools_scm]
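The dependency block above drops `trl`, bumps `cehrbert` and `transformers`, and adds `cehrbert_data`, `peft`, and `lightgbm`. As a quick check that an environment actually picked up the new pins after `pip install .`, here is a small script using only the standard library (package names taken from this diff; no cehrgpt-specific API assumed):

```python
# Report the installed versions of the dependencies pinned by cehrgpt 0.1.0.
from importlib.metadata import PackageNotFoundError, version

pinned = [
    "cehrbert",
    "cehrbert_data",
    "openai",
    "optuna",
    "transformers",
    "tokenizers",
    "peft",
    "lightgbm",
]

for name in pinned:
    try:
        print(f"{name}=={version(name)}")
    except PackageNotFoundError:
        print(f"{name}: not installed")
```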
cehrgpt-0.1.0/sample_configs/cehrgpt_pretrain_sample_config.yaml
@@ -0,0 +1,51 @@
+ model_name_or_path: "test_results"
+ tokenizer_name_or_path: "test_results"
+
+ data_folder: "sample_data/pretrain"
+ dataset_prepared_path: "test_dataset_prepared"
+ validation_split_percentage: 0.05
+ validation_split_num: 10
+ preprocessing_num_workers: 4
+ preprocessing_batch_size: 1000
+ streaming: true
+
+ #Tokenizer
+ vocab_size: 50000
+ min_frequency: 0
+
+ do_train: true
+ overwrite_output_dir: false
+ resume_from_checkpoint: # path to the checkpoint folder
+ seed: 42
+
+ num_hidden_layers: 6
+ hidden_size: 768
+ n_head: 12
+ max_position_embeddings: 1024
+
+ # torch dataloader configs
+ dataloader_num_workers: 4
+ dataloader_prefetch_factor: 2
+
+ output_dir: "test_results"
+ save_strategy: "steps"
+ evaluation_strategy: "no"
+ learning_rate: 0.00005
+ per_device_train_batch_size: 4
+ per_device_eval_batch_size: 4
+ gradient_accumulation_steps: 1
+ num_train_epochs: 1
+ # When streaming is set to True, max_steps needs to be provided
+ max_steps: 1000
+ save_steps: 500
+
+ warmup_steps: 100
+ weight_decay: 0.01
+ logging_dir: "./logs"
+ logging_steps: 100
+ save_total_limit: 5
+ load_best_model_at_end: false
+ metric_for_best_model: "eval_loss"
+ greater_is_better: false
+
+ report_to: "none"
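The new sample configuration mixes data paths, tokenizer settings, model-size fields, and Hugging Face `TrainingArguments`-style options in a single flat YAML file consumed by the pretraining runner. A small sketch for sanity-checking the file before launching a run, using plain PyYAML and no cehrgpt-specific API; the streaming check simply mirrors the inline comment in the config:

```python
# Validate a few fields of the sample pretraining config before starting a run.
import yaml  # provided by the pyyaml package

with open("sample_configs/cehrgpt_pretrain_sample_config.yaml") as f:
    config = yaml.safe_load(f)

print("data_folder:", config["data_folder"])
print("output_dir:", config["output_dir"])
print("streaming:", config["streaming"])

# Per the comment in the config, max_steps must be provided when streaming is enabled.
if config.get("streaming") and not config.get("max_steps"):
    raise ValueError("streaming is true, so max_steps must be set")
```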
{cehrgpt-0.0.1 → cehrgpt-0.1.0}/scripts/level_three_evaluation.sh
@@ -29,7 +29,8 @@ python -u -m cehrbert_data.prediction_cohorts.cad_cabg_cohort \
  -dl 1985-01-01 -du 2023-12-31 \
  -l 18 -u 100 -ow 360 -ps 0 -pw 360 -f \
  --att_type cehr_bert \
- --ehr_table_list condition_occurrence procedure_occurrence drug_exposure -iv
+ --ehr_table_list condition_occurrence procedure_occurrence drug_exposure -iv \
+ --is_remove_index_prediction_starts

  # Run Predictions on CAD CABG
  echo "Run predictions on cad_cabg"
@@ -56,9 +57,10 @@ python -u -m cehrbert_data.prediction_cohorts.hf_readmission \
  -c hf_readmission_bow \
  -i "$OMOP_FOLDER" \
  -o "$OMOP_FOLDER/cohorts/hf_readmission" \
- -dl 1985-01-01 -du 2023-12-31 -l 18 -u 100 -ow 360 -ps 0 -pw 30 -f \
+ -dl 1985-01-01 -du 2023-12-31 -l 18 -u 100 -ow 360 -ps 1 -pw 30 -f \
  --att_type cehr_bert \
- --ehr_table_list condition_occurrence procedure_occurrence drug_exposure -iv
+ --ehr_table_list condition_occurrence procedure_occurrence drug_exposure -iv \
+ --is_remove_index_prediction_starts

  # Run predictions on HF Readmission
  echo "Run predictions on hf_readmission"
@@ -85,9 +87,10 @@ python -u -m cehrbert_data.prediction_cohorts.copd_readmission \
  -c copd_readmission_bow \
  -i "$OMOP_FOLDER" \
  -o "$OMOP_FOLDER/cohorts/copd_readmission" \
- -dl 1985-01-01 -du 2023-12-31 -l 18 -u 100 -ow 720 -ps 0 -pw 360 -f \
+ -dl 1985-01-01 -du 2023-12-31 -l 18 -u 100 -ow 360 -ps 1 -pw 30 -f \
  --att_type cehr_bert \
- --ehr_table_list condition_occurrence procedure_occurrence drug_exposure -iv
+ --ehr_table_list condition_occurrence procedure_occurrence drug_exposure -iv \
+ --is_remove_index_prediction_starts

  # Run predictions on COPD Readmission
  echo "Run predictions on copd_readmission"
@@ -145,7 +148,8 @@ python -u -m cehrbert_data.prediction_cohorts.afib_ischemic_stroke \
  -o "$OMOP_FOLDER/cohorts/afib_ischemic_stroke" \
  -dl 1985-01-01 -du 2023-12-31 -l 18 -u 100 -ow 720 -ps 0 -pw 360 -f \
  --att_type cehr_bert \
- --ehr_table_list condition_occurrence procedure_occurrence drug_exposure -iv
+ --ehr_table_list condition_occurrence procedure_occurrence drug_exposure -iv \
+ --is_remove_index_prediction_starts

  # Run predictions on AFIB Ischemic Stroke
  echo "Run predictions on afib_ischemic_stroke"
cehrgpt-0.1.0/scripts/omop_pipeline.sh
@@ -0,0 +1,55 @@
+ #!/bin/bash
+
+ # Exporting input arguments as environment variables
+ export PATIENT_SEQUENCE_FOLDER="$1"
+ export OMOP_FOLDER="$2"
+ export SOURCE_OMOP_FOLDER="$3"
+ export PATIENT_SPLITS_FOLDER="$SOURCE_OMOP_FOLDER/patient_splits"
+
+ # Echoing the values of the environment variables
+ echo "PATIENT_SEQUENCE_FOLDER=$PATIENT_SEQUENCE_FOLDER"
+ echo "OMOP_FOLDER=$OMOP_FOLDER"
+ echo "SOURCE_OMOP_FOLDER=$SOURCE_OMOP_FOLDER"
+
+ # Ensure OMOP_FOLDER exists
+ if [ ! -d "$OMOP_FOLDER" ]; then
+ echo "Creating $OMOP_FOLDER"
+ mkdir -p "$OMOP_FOLDER"
+ fi
+
+ # Removing existing OMOP tables
+ rm -rf $OMOP_FOLDER/{person,visit_occurrence,condition_occurrence,procedure_occurrence,drug_exposure,death,measurement,observation_period,condition_era}
+
+ # Removing existing OMOP concept tables
+ rm -rf $OMOP_FOLDER/{concept,concept_ancestor,concept_relationship}
+
+ # Copying OMOP concept tables if they don't already exist
+ for table in concept concept_relationship concept_ancestor; do
+ if [ ! -d "$OMOP_FOLDER/$table" ]; then
+ echo "Creating $OMOP_FOLDER/$table"
+ cp -r "$SOURCE_OMOP_FOLDER/$table" "$OMOP_FOLDER/$table"
+ fi
+ done
+
+ # Reconstructing the OMOP instance from patient sequences
+ echo "Reconstructing the OMOP instance from patient sequences in $OMOP_FOLDER"
+ python -m cehrgpt.generation.omop_converter_batch \
+ --patient_sequence_path "$PATIENT_SEQUENCE_FOLDER" \
+ --output_folder "$OMOP_FOLDER" \
+ --concept_path "$OMOP_FOLDER/concept" \
+ --buffer_size 1280 \
+ --cpu_cores 10
+
+ # Create observation_period
+ echo "Reconstructing observation_period in $OMOP_FOLDER"
+ python -u -m cehrgpt.omop.observation_period \
+ --input_folder "$OMOP_FOLDER" \
+ --output_folder "$OMOP_FOLDER" \
+ --domain_table_list "condition_occurrence drug_exposure procedure_occurrence measurement"
+
+ # Create condition_era
+ echo "Reconstructing condition_era in $OMOP_FOLDER"
+ python -u -m cehrgpt.omop.condition_era \
+ --input_folder "$OMOP_FOLDER" \
+ --output_folder "$OMOP_FOLDER" \
+ --domain_table_list "condition_occurrence"
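Once `omop_pipeline.sh` finishes, the reconstructed OMOP tables live under the output folder passed as the second argument (e.g. `test_results/top_p10000/restored_omop/` in the README example). The sketch below spot-checks row counts; it assumes each table is written as a directory of Parquet files, which the PySpark-based derived-table steps suggest but this diff does not state outright.

```python
# Spot-check the OMOP tables reconstructed by scripts/omop_pipeline.sh.
# Assumption: each table is a directory of Parquet files under the output folder.
from pathlib import Path

import pandas as pd

omop_folder = Path("test_results/top_p10000/restored_omop")

for table in ["person", "visit_occurrence", "condition_occurrence", "observation_period", "condition_era"]:
    table_dir = omop_folder / table
    if table_dir.exists():
        print(f"{table}: {len(pd.read_parquet(table_dir)):,} rows")
    else:
        print(f"{table}: missing")
```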
{cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/data/hf_cehrgpt_dataset.py
@@ -1,9 +1,10 @@
- from typing import Union
+ from typing import Optional, Union

  from cehrbert.data_generators.hf_data_generator.hf_dataset import (
      FINETUNING_COLUMNS,
      apply_cehrbert_dataset_mapping,
  )
+ from cehrbert.data_generators.hf_data_generator.meds_utils import CacheFileCollector
  from cehrbert.runners.hf_runner_argument_dataclass import DataTrainingArguments
  from datasets import Dataset, DatasetDict

@@ -31,16 +32,25 @@ def create_cehrgpt_pretraining_dataset(
      dataset: Union[Dataset, DatasetDict],
      cehrgpt_tokenizer: CehrGptTokenizer,
      data_args: DataTrainingArguments,
- ) -> Dataset:
+     cache_file_collector: Optional[CacheFileCollector] = None,
+ ) -> Union[Dataset, DatasetDict]:
      required_columns = TRANSFORMER_COLUMNS + CEHRGPT_COLUMNS
+     # TODO: temp solution, this column is mixed typed and causes an issue when transforming the data
+     if not data_args.streaming:
+         if isinstance(dataset, DatasetDict):
+             all_columns = dataset["train"].column_names
+         else:
+             all_columns = dataset.column_names
+         if "visit_concept_ids" in all_columns:
+             dataset.remove_columns(["visit_concept_ids"])
      dataset = apply_cehrbert_dataset_mapping(
          dataset,
          HFCehrGptTokenizationMapping(cehrgpt_tokenizer),
          num_proc=data_args.preprocessing_num_workers,
          batch_size=data_args.preprocessing_batch_size,
          streaming=data_args.streaming,
+         cache_file_collector=cache_file_collector,
      )
-
      if not data_args.streaming:
          if isinstance(dataset, DatasetDict):
              all_columns = dataset["train"].column_names
@@ -56,8 +66,17 @@ def create_cehrgpt_finetuning_dataset(
      dataset: Union[Dataset, DatasetDict],
      cehrgpt_tokenizer: CehrGptTokenizer,
      data_args: DataTrainingArguments,
- ) -> Dataset:
+     cache_file_collector: Optional[CacheFileCollector] = None,
+ ) -> Union[Dataset, DatasetDict]:
      required_columns = TRANSFORMER_COLUMNS + CEHRGPT_COLUMNS + FINETUNING_COLUMNS
+     # TODO: temp solution, this column is mixed typed and causes an issue when transforming the data
+     if not data_args.streaming:
+         if isinstance(dataset, DatasetDict):
+             all_columns = dataset["train"].column_names
+         else:
+             all_columns = dataset.column_names
+         if "visit_concept_ids" in all_columns:
+             dataset.remove_columns(["visit_concept_ids"])
      mapping_functions = [
          HFFineTuningMapping(cehrgpt_tokenizer),
      ]
@@ -68,6 +87,7 @@ def create_cehrgpt_finetuning_dataset(
          num_proc=data_args.preprocessing_num_workers,
          batch_size=data_args.preprocessing_batch_size,
          streaming=data_args.streaming,
+         cache_file_collector=cache_file_collector,
      )

      if not data_args.streaming:
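The non-streaming branch added above drops the mixed-type `visit_concept_ids` column before tokenization. The toy sketch below reproduces that check outside the runner with an in-memory `datasets.Dataset`; note that `remove_columns` returns a new `Dataset`/`DatasetDict` rather than mutating in place, so the sketch reassigns the result.

```python
# Toy illustration of the non-streaming visit_concept_ids clean-up shown above.
from datasets import Dataset, DatasetDict

dataset = Dataset.from_dict(
    {
        "concept_ids": [["c1", "c2"], ["c3"]],
        "visit_concept_ids": [["v1", "v2"], ["v3"]],  # stands in for the mixed-typed column
    }
)

if isinstance(dataset, DatasetDict):
    all_columns = dataset["train"].column_names
else:
    all_columns = dataset.column_names

if "visit_concept_ids" in all_columns:
    # remove_columns returns a new object; keep the returned dataset.
    dataset = dataset.remove_columns(["visit_concept_ids"])

print(dataset.column_names)  # ['concept_ids']
```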