cehrgpt 0.0.1__tar.gz → 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cehrgpt-0.1.0/.gitignore +28 -0
- {cehrgpt-0.0.1/src/cehrgpt.egg-info → cehrgpt-0.1.0}/PKG-INFO +57 -9
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/README.md +49 -4
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/pyproject.toml +6 -4
- cehrgpt-0.1.0/sample_configs/cehrgpt_pretrain_sample_config.yaml +51 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/scripts/level_three_evaluation.sh +10 -6
- cehrgpt-0.1.0/scripts/omop_pipeline.sh +55 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/data/hf_cehrgpt_dataset.py +24 -4
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/data/hf_cehrgpt_dataset_collator.py +260 -84
- cehrgpt-0.1.0/src/cehrgpt/data/hf_cehrgpt_dataset_mapping.py +393 -0
- cehrgpt-0.1.0/src/cehrgpt/data/sample_packing_sampler.py +151 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/generation/generate_batch_hf_gpt_sequence.py +12 -9
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/generation/omop_converter_batch.py +3 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/models/config.py +10 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/models/hf_cehrgpt.py +244 -73
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/models/tokenization_hf_cehrgpt.py +6 -2
- cehrgpt-0.1.0/src/cehrgpt/runners/data_utils.py +243 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/runners/gpt_runner_util.py +0 -10
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/runners/hf_cehrgpt_finetune_runner.py +154 -260
- cehrgpt-0.1.0/src/cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +530 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +46 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/runners/hyperparameter_search_util.py +4 -1
- cehrgpt-0.1.0/src/cehrgpt/runners/sample_packing_trainer.py +168 -0
- cehrgpt-0.1.0/src/cehrgpt/simulations/generate_plots.py +95 -0
- cehrgpt-0.1.0/src/cehrgpt/simulations/run_simulation.sh +24 -0
- cehrgpt-0.1.0/src/cehrgpt/simulations/time_embedding_simulation.py +250 -0
- cehrgpt-0.1.0/src/cehrgpt/simulations/time_token_simulation.py +177 -0
- cehrgpt-0.1.0/src/cehrgpt/tools/generate_causal_patient_split_by_age.py +146 -0
- cehrgpt-0.1.0/src/cehrgpt/tools/linear_prob/compute_cehrgpt_features.py +467 -0
- cehrgpt-0.1.0/src/cehrgpt/tools/linear_prob/train_with_cehrgpt_features.py +152 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0/src/cehrgpt.egg-info}/PKG-INFO +57 -9
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt.egg-info/SOURCES.txt +15 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt.egg-info/requires.txt +6 -4
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/tests/integration_tests/runners/hf_cehrgpt_pretrain_runner_test.py +15 -5
- cehrgpt-0.1.0/tests/integration_tests/runners/hf_cehrgpt_pretrain_sample_packing_runner_test.py +115 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/tests/integration_tests/runners/hf_cehrgpt_pretrain_sfm_runner_test.py +9 -3
- cehrgpt-0.1.0/tests/unit_tests/models/model_utils_test.py +131 -0
- cehrgpt-0.1.0/tests/unit_tests/runners/__init__.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/tests/unit_tests/runners/hf_cehrgpt_finetune_runner_test.py +4 -4
- cehrgpt-0.1.0/tests/unit_tests/tools/__init__.py +0 -0
- cehrgpt-0.0.1/.gitignore +0 -38
- cehrgpt-0.0.1/scripts/omop_pipeline.sh +0 -73
- cehrgpt-0.0.1/src/cehrgpt/data/hf_cehrgpt_dataset_mapping.py +0 -116
- cehrgpt-0.0.1/src/cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +0 -370
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/.github/workflows/build-python.yaml +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/.github/workflows/tests.yaml +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/.pre-commit-config.yaml +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/LICENSE +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/sample_data/pretrain/patient_sequence.parquet +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/sample_data/pretrained_embeddings/pretrained_embedding_concepts.pkl +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/sample_data/pretrained_embeddings/pretrained_embedding_vectors.npy +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/scripts/pool_generated_sequences.sh +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/setup.cfg +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/__init__.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/__init__.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/analysis/__init__.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/analysis/privacy/__init__.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/analysis/privacy/attribute_inference.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/analysis/privacy/attribute_inference_config.yml +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/analysis/privacy/member_inference.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/analysis/privacy/nearest_neighbor_inference.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/analysis/privacy/reid_inference.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/analysis/privacy/utils.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/cehrgpt_args.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/data/__init__.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/generation/__init__.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/generation/chatgpt_generation.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/generation/omop_entity.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/gpt_utils.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/models/__init__.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/models/hf_modeling_outputs.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/models/pretrained_embeddings.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/models/special_tokens.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/omop/__init__.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/omop/condition_era.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/omop/observation_period.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/omop/omop_argparse.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/omop/omop_table_builder.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/omop/queries/__init__.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/omop/queries/condition_era.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/omop/queries/observation_period.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/omop/sample_omop_tables.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/runners/__init__.py +0 -0
- {cehrgpt-0.0.1/src/cehrgpt/time_to_event → cehrgpt-0.1.0/src/cehrgpt/simulations}/__init__.py +0 -0
- {cehrgpt-0.0.1/src/cehrgpt/tools → cehrgpt-0.1.0/src/cehrgpt/time_to_event}/__init__.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/time_to_event/config/30_day_readmission.yaml +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/time_to_event/config/next_visit_type_prediction.yaml +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/time_to_event/config/t2dm_hf.yaml +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/time_to_event/time_to_event_model.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/time_to_event/time_to_event_prediction.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/time_to_event/time_to_event_utils.py +0 -0
- {cehrgpt-0.0.1/tests → cehrgpt-0.1.0/src/cehrgpt/tools}/__init__.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/tools/ehrshot_benchmark.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/tools/generate_pretrained_embeddings.py +0 -0
- {cehrgpt-0.0.1/tests/integration_tests → cehrgpt-0.1.0/src/cehrgpt/tools/linear_prob}/__init__.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/tools/merge_synthetic_real_dataasets.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt/tools/upload_omop_tables.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt.egg-info/dependency_links.txt +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/src/cehrgpt.egg-info/top_level.txt +0 -0
- {cehrgpt-0.0.1/tests/integration_tests/runners → cehrgpt-0.1.0/tests}/__init__.py +0 -0
- {cehrgpt-0.0.1/tests/unit_tests → cehrgpt-0.1.0/tests/integration_tests}/__init__.py +0 -0
- {cehrgpt-0.0.1/tests/unit_tests/models → cehrgpt-0.1.0/tests/integration_tests/runners}/__init__.py +0 -0
- {cehrgpt-0.0.1/tests/unit_tests/models/tokenization → cehrgpt-0.1.0/tests/unit_tests}/__init__.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/tests/unit_tests/gpt_utils_test.py +0 -0
- {cehrgpt-0.0.1/tests/unit_tests/runners → cehrgpt-0.1.0/tests/unit_tests/models}/__init__.py +0 -0
- {cehrgpt-0.0.1/tests/unit_tests/tools → cehrgpt-0.1.0/tests/unit_tests/models/tokenization}/__init__.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/tests/unit_tests/models/tokenization/create_bins_with_spline_test.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/tests/unit_tests/models/tokenization/create_sample_from_bins_test.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/tests/unit_tests/numeric_concept_statistics_test.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/tests/unit_tests/tokenization_test.py +0 -0
- {cehrgpt-0.0.1 → cehrgpt-0.1.0}/tests/unit_tests/tools/upload_omop_tables_test.py +0 -0
cehrgpt-0.1.0/.gitignore
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
.DS_Store
|
2
|
+
.idea/
|
3
|
+
.vscode/
|
4
|
+
venv*
|
5
|
+
dist/*
|
6
|
+
|
7
|
+
*png
|
8
|
+
*json
|
9
|
+
|
10
|
+
*ipynb_checkpoints/
|
11
|
+
*h5
|
12
|
+
*logs
|
13
|
+
*nohup.out
|
14
|
+
*ipynb
|
15
|
+
|
16
|
+
*__pycache__/
|
17
|
+
.eggs/
|
18
|
+
*.dat
|
19
|
+
.metastore_db/
|
20
|
+
|
21
|
+
build/
|
22
|
+
|
23
|
+
*.out
|
24
|
+
*.egg-info/
|
25
|
+
|
26
|
+
test_data
|
27
|
+
test_dataset_prepared
|
28
|
+
test*results
|
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: cehrgpt
|
3
|
-
Version: 0.0
|
3
|
+
Version: 0.1.0
|
4
4
|
Summary: CEHR-GPT: Generating Electronic Health Records with Chronological Patient Timelines
|
5
5
|
Author-email: Chao Pang <chaopang229@gmail.com>, Xinzhuo Jiang <xj2193@cumc.columbia.edu>, Krishna Kalluri <kk3326@cumc.columbia.edu>, Elise Minto <em3697@cumc.columbia.edu>, Jason Patterson <jp3477@cumc.columbia.edu>, Nishanth Parameshwar Pavinkurve <np2689@cumc.columbia.edu>, Karthik Natarajan <kn2174@cumc.columbia.edu>
|
6
6
|
License: MIT License
|
@@ -12,12 +12,14 @@ Classifier: Programming Language :: Python :: 3
|
|
12
12
|
Requires-Python: >=3.10.0
|
13
13
|
Description-Content-Type: text/markdown
|
14
14
|
License-File: LICENSE
|
15
|
-
Requires-Dist: cehrbert==1.
|
15
|
+
Requires-Dist: cehrbert==1.4.1
|
16
|
+
Requires-Dist: cehrbert_data==0.0.7
|
16
17
|
Requires-Dist: openai==1.54.3
|
17
18
|
Requires-Dist: optuna==4.0.0
|
18
|
-
Requires-Dist: transformers==4.
|
19
|
-
Requires-Dist: tokenizers==0.19
|
20
|
-
Requires-Dist:
|
19
|
+
Requires-Dist: transformers==4.44.0
|
20
|
+
Requires-Dist: tokenizers==0.19.0
|
21
|
+
Requires-Dist: peft==0.10.0
|
22
|
+
Requires-Dist: lightgbm
|
21
23
|
Provides-Extra: dev
|
22
24
|
Requires-Dist: pre-commit; extra == "dev"
|
23
25
|
Requires-Dist: pytest; extra == "dev"
|
@@ -28,6 +30,7 @@ Requires-Dist: hypothesis; extra == "dev"
|
|
28
30
|
Requires-Dist: black; extra == "dev"
|
29
31
|
Provides-Extra: flash-attn
|
30
32
|
Requires-Dist: flash_attn; extra == "flash-attn"
|
33
|
+
Dynamic: license-file
|
31
34
|
|
32
35
|
# CEHRGPT
|
33
36
|
|
@@ -50,11 +53,57 @@ CEHRGPT is a synthetic data generation model developed to handle structured elec
|
|
50
53
|
To install CEHRGPT, clone this repository and install the required dependencies.
|
51
54
|
|
52
55
|
```bash
|
53
|
-
git clone https://github.com/knatarajan-lab/cehrgpt
|
54
|
-
cd cehrgpt
|
56
|
+
git clone https://github.com/knatarajan-lab/cehrgpt.git
|
57
|
+
cd cehrgpt
|
55
58
|
pip install .
|
56
59
|
```
|
57
60
|
|
61
|
+
## Pretrain
|
62
|
+
Pretrain cehrgpt using the Hugging Face trainer; the parameters can be found in the sample configuration YAML file.
|
63
|
+
```bash
|
64
|
+
mkdir test_results
|
65
|
+
# This is NOT required when streaming is set to true
|
66
|
+
mkdir test_dataset_prepared
|
67
|
+
python -u -m cehrgpt.runners.hf_cehrgpt_pretrain_runner sample_configs/cehrgpt_pretrain_sample_config.yaml
|
68
|
+
```
|
69
|
+
|
70
|
+
## Generate synthetic sequences
|
71
|
+
Generate synthetic sequences using the trained model
|
72
|
+
```bash
|
73
|
+
export TRANSFORMERS_VERBOSITY=info
|
74
|
+
export CUDA_VISIBLE_DEVICES="0"
|
75
|
+
python -u -m cehrgpt.generation.generate_batch_hf_gpt_sequence \
|
76
|
+
--model_folder test_results \
|
77
|
+
--tokenizer_folder test_results \
|
78
|
+
--output_folder test_results \
|
79
|
+
--num_of_patients 128 \
|
80
|
+
--batch_size 32 \
|
81
|
+
--buffer_size 128 \
|
82
|
+
--context_window 1024 \
|
83
|
+
--sampling_strategy TopPStrategy \
|
84
|
+
--top_p 1.0 --temperature 1.0 --repetition_penalty 1.0 \
|
85
|
+
--epsilon_cutoff 0.00 \
|
86
|
+
--demographic_data_path sample_data/pretrain
|
87
|
+
```
|
88
|
+
|
89
|
+
## Convert synthetic sequences to OMOP
|
90
|
+
```bash
|
91
|
+
# omop converter requires the OHDSI vocabulary
|
92
|
+
export OMOP_VOCAB_DIR=""
|
93
|
+
# the omop derived tables need to be built using pyspark
|
94
|
+
export SPARK_WORKER_INSTANCES="1"
|
95
|
+
export SPARK_WORKER_CORES="8"
|
96
|
+
export SPARK_EXECUTOR_CORES="2"
|
97
|
+
export SPARK_DRIVER_MEMORY="2g"
|
98
|
+
export SPARK_EXECUTOR_MEMORY="2g"
|
99
|
+
|
100
|
+
# Convert the sequences, create the omop derived tables
|
101
|
+
sh scripts/omop_pipeline.sh \
|
102
|
+
test_results/top_p10000/generated_sequences/ \
|
103
|
+
test_results/top_p10000/restored_omop/ \
|
104
|
+
$OMOP_VOCAB_DIR
|
105
|
+
```
|
106
|
+
|
58
107
|
## Citation
|
59
108
|
```
|
60
109
|
@article{cehrgpt2024,
|
@@ -63,4 +112,3 @@ pip install .
|
|
63
112
|
journal={arXiv preprint arXiv:2402.04400},
|
64
113
|
year={2024}
|
65
114
|
}
|
66
|
-
```
|
@@ -19,11 +19,57 @@ CEHRGPT is a synthetic data generation model developed to handle structured elec
|
|
19
19
|
To install CEHRGPT, clone this repository and install the required dependencies.
|
20
20
|
|
21
21
|
```bash
|
22
|
-
git clone https://github.com/knatarajan-lab/cehrgpt
|
23
|
-
cd cehrgpt
|
22
|
+
git clone https://github.com/knatarajan-lab/cehrgpt.git
|
23
|
+
cd cehrgpt
|
24
24
|
pip install .
|
25
25
|
```
|
26
26
|
|
27
|
+
## Pretrain
|
28
|
+
Pretrain cehrgpt using the Hugging Face trainer; the parameters can be found in the sample configuration YAML file.
|
29
|
+
```bash
|
30
|
+
mkdir test_results
|
31
|
+
# This is NOT required when streaming is set to true
|
32
|
+
mkdir test_dataset_prepared
|
33
|
+
python -u -m cehrgpt.runners.hf_cehrgpt_pretrain_runner sample_configs/cehrgpt_pretrain_sample_config.yaml
|
34
|
+
```
|
35
|
+
|
36
|
+
## Generate synthetic sequences
|
37
|
+
Generate synthetic sequences using the trained model
|
38
|
+
```bash
|
39
|
+
export TRANSFORMERS_VERBOSITY=info
|
40
|
+
export CUDA_VISIBLE_DEVICES="0"
|
41
|
+
python -u -m cehrgpt.generation.generate_batch_hf_gpt_sequence \
|
42
|
+
--model_folder test_results \
|
43
|
+
--tokenizer_folder test_results \
|
44
|
+
--output_folder test_results \
|
45
|
+
--num_of_patients 128 \
|
46
|
+
--batch_size 32 \
|
47
|
+
--buffer_size 128 \
|
48
|
+
--context_window 1024 \
|
49
|
+
--sampling_strategy TopPStrategy \
|
50
|
+
--top_p 1.0 --temperature 1.0 --repetition_penalty 1.0 \
|
51
|
+
--epsilon_cutoff 0.00 \
|
52
|
+
--demographic_data_path sample_data/pretrain
|
53
|
+
```
|
54
|
+
|
55
|
+
## Convert synthetic sequences to OMOP
|
56
|
+
```bash
|
57
|
+
# omop converter requires the OHDSI vocabulary
|
58
|
+
export OMOP_VOCAB_DIR=""
|
59
|
+
# the omop derived tables need to be built using pyspark
|
60
|
+
export SPARK_WORKER_INSTANCES="1"
|
61
|
+
export SPARK_WORKER_CORES="8"
|
62
|
+
export SPARK_EXECUTOR_CORES="2"
|
63
|
+
export SPARK_DRIVER_MEMORY="2g"
|
64
|
+
export SPARK_EXECUTOR_MEMORY="2g"
|
65
|
+
|
66
|
+
# Convert the sequences, create the omop derived tables
|
67
|
+
sh scripts/omop_pipeline.sh \
|
68
|
+
test_results/top_p10000/generated_sequences/ \
|
69
|
+
test_results/top_p10000/restored_omop/ \
|
70
|
+
$OMOP_VOCAB_DIR
|
71
|
+
```
|
72
|
+
|
27
73
|
## Citation
|
28
74
|
```
|
29
75
|
@article{cehrgpt2024,
|
@@ -31,5 +77,4 @@ pip install .
|
|
31
77
|
author={Natarajan, K and others},
|
32
78
|
journal={arXiv preprint arXiv:2402.04400},
|
33
79
|
year={2024}
|
34
|
-
}
|
35
|
-
```
|
80
|
+
}
|
@@ -28,12 +28,14 @@ classifiers = [
|
|
28
28
|
]
|
29
29
|
|
30
30
|
dependencies = [
|
31
|
-
"cehrbert==1.
|
31
|
+
"cehrbert==1.4.1",
|
32
|
+
"cehrbert_data==0.0.7",
|
32
33
|
"openai==1.54.3",
|
33
34
|
"optuna==4.0.0",
|
34
|
-
"transformers==4.
|
35
|
-
"tokenizers==0.19",
|
36
|
-
"
|
35
|
+
"transformers==4.44.0",
|
36
|
+
"tokenizers==0.19.0",
|
37
|
+
"peft==0.10.0",
|
38
|
+
"lightgbm",
|
37
39
|
]
|
38
40
|
|
39
41
|
[tool.setuptools_scm]
|
@@ -0,0 +1,51 @@
|
|
1
|
+
model_name_or_path: "test_results"
|
2
|
+
tokenizer_name_or_path: "test_results"
|
3
|
+
|
4
|
+
data_folder: "sample_data/pretrain"
|
5
|
+
dataset_prepared_path: "test_dataset_prepared"
|
6
|
+
validation_split_percentage: 0.05
|
7
|
+
validation_split_num: 10
|
8
|
+
preprocessing_num_workers: 4
|
9
|
+
preprocessing_batch_size: 1000
|
10
|
+
streaming: true
|
11
|
+
|
12
|
+
#Tokenizer
|
13
|
+
vocab_size: 50000
|
14
|
+
min_frequency: 0
|
15
|
+
|
16
|
+
do_train: true
|
17
|
+
overwrite_output_dir: false
|
18
|
+
resume_from_checkpoint: # path to the checkpoint folder
|
19
|
+
seed: 42
|
20
|
+
|
21
|
+
num_hidden_layers: 6
|
22
|
+
hidden_size: 768
|
23
|
+
n_head: 12
|
24
|
+
max_position_embeddings: 1024
|
25
|
+
|
26
|
+
# torch dataloader configs
|
27
|
+
dataloader_num_workers: 4
|
28
|
+
dataloader_prefetch_factor: 2
|
29
|
+
|
30
|
+
output_dir: "test_results"
|
31
|
+
save_strategy: "steps"
|
32
|
+
evaluation_strategy: "no"
|
33
|
+
learning_rate: 0.00005
|
34
|
+
per_device_train_batch_size: 4
|
35
|
+
per_device_eval_batch_size: 4
|
36
|
+
gradient_accumulation_steps: 1
|
37
|
+
num_train_epochs: 1
|
38
|
+
# When streaming is set to True, max_steps needs to be provided
|
39
|
+
max_steps: 1000
|
40
|
+
save_steps: 500
|
41
|
+
|
42
|
+
warmup_steps: 100
|
43
|
+
weight_decay: 0.01
|
44
|
+
logging_dir: "./logs"
|
45
|
+
logging_steps: 100
|
46
|
+
save_total_limit: 5
|
47
|
+
load_best_model_at_end: false
|
48
|
+
metric_for_best_model: "eval_loss"
|
49
|
+
greater_is_better: false
|
50
|
+
|
51
|
+
report_to: "none"
|
@@ -29,7 +29,8 @@ python -u -m cehrbert_data.prediction_cohorts.cad_cabg_cohort \
|
|
29
29
|
-dl 1985-01-01 -du 2023-12-31 \
|
30
30
|
-l 18 -u 100 -ow 360 -ps 0 -pw 360 -f \
|
31
31
|
--att_type cehr_bert \
|
32
|
-
--ehr_table_list condition_occurrence procedure_occurrence drug_exposure -iv
|
32
|
+
--ehr_table_list condition_occurrence procedure_occurrence drug_exposure -iv \
|
33
|
+
--is_remove_index_prediction_starts
|
33
34
|
|
34
35
|
# Run Predictions on CAD CABG
|
35
36
|
echo "Run predictions on cad_cabg"
|
@@ -56,9 +57,10 @@ python -u -m cehrbert_data.prediction_cohorts.hf_readmission \
|
|
56
57
|
-c hf_readmission_bow \
|
57
58
|
-i "$OMOP_FOLDER" \
|
58
59
|
-o "$OMOP_FOLDER/cohorts/hf_readmission" \
|
59
|
-
-dl 1985-01-01 -du 2023-12-31 -l 18 -u 100 -ow 360 -ps
|
60
|
+
-dl 1985-01-01 -du 2023-12-31 -l 18 -u 100 -ow 360 -ps 1 -pw 30 -f \
|
60
61
|
--att_type cehr_bert \
|
61
|
-
--ehr_table_list condition_occurrence procedure_occurrence drug_exposure -iv
|
62
|
+
--ehr_table_list condition_occurrence procedure_occurrence drug_exposure -iv \
|
63
|
+
--is_remove_index_prediction_starts
|
62
64
|
|
63
65
|
# Run predictions on HF Readmission
|
64
66
|
echo "Run predictions on hf_readmission"
|
@@ -85,9 +87,10 @@ python -u -m cehrbert_data.prediction_cohorts.copd_readmission \
|
|
85
87
|
-c copd_readmission_bow \
|
86
88
|
-i "$OMOP_FOLDER" \
|
87
89
|
-o "$OMOP_FOLDER/cohorts/copd_readmission" \
|
88
|
-
-dl 1985-01-01 -du 2023-12-31 -l 18 -u 100 -ow
|
90
|
+
-dl 1985-01-01 -du 2023-12-31 -l 18 -u 100 -ow 360 -ps 1 -pw 30 -f \
|
89
91
|
--att_type cehr_bert \
|
90
|
-
--ehr_table_list condition_occurrence procedure_occurrence drug_exposure -iv
|
92
|
+
--ehr_table_list condition_occurrence procedure_occurrence drug_exposure -iv \
|
93
|
+
--is_remove_index_prediction_starts
|
91
94
|
|
92
95
|
# Run predictions on COPD Readmission
|
93
96
|
echo "Run predictions on copd_readmission"
|
@@ -145,7 +148,8 @@ python -u -m cehrbert_data.prediction_cohorts.afib_ischemic_stroke \
|
|
145
148
|
-o "$OMOP_FOLDER/cohorts/afib_ischemic_stroke" \
|
146
149
|
-dl 1985-01-01 -du 2023-12-31 -l 18 -u 100 -ow 720 -ps 0 -pw 360 -f \
|
147
150
|
--att_type cehr_bert \
|
148
|
-
--ehr_table_list condition_occurrence procedure_occurrence drug_exposure -iv
|
151
|
+
--ehr_table_list condition_occurrence procedure_occurrence drug_exposure -iv \
|
152
|
+
--is_remove_index_prediction_starts
|
149
153
|
|
150
154
|
# Run predictions on AFIB Ischemic Stroke
|
151
155
|
echo "Run predictions on afib_ischemic_stroke"
|
@@ -0,0 +1,55 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
|
3
|
+
# Exporting input arguments as environment variables
|
4
|
+
export PATIENT_SEQUENCE_FOLDER="$1"
|
5
|
+
export OMOP_FOLDER="$2"
|
6
|
+
export SOURCE_OMOP_FOLDER="$3"
|
7
|
+
export PATIENT_SPLITS_FOLDER="$SOURCE_OMOP_FOLDER/patient_splits"
|
8
|
+
|
9
|
+
# Echoing the values of the environment variables
|
10
|
+
echo "PATIENT_SEQUENCE_FOLDER=$PATIENT_SEQUENCE_FOLDER"
|
11
|
+
echo "OMOP_FOLDER=$OMOP_FOLDER"
|
12
|
+
echo "SOURCE_OMOP_FOLDER=$SOURCE_OMOP_FOLDER"
|
13
|
+
|
14
|
+
# Ensure OMOP_FOLDER exists
|
15
|
+
if [ ! -d "$OMOP_FOLDER" ]; then
|
16
|
+
echo "Creating $OMOP_FOLDER"
|
17
|
+
mkdir -p "$OMOP_FOLDER"
|
18
|
+
fi
|
19
|
+
|
20
|
+
# Removing existing OMOP tables
|
21
|
+
rm -rf $OMOP_FOLDER/{person,visit_occurrence,condition_occurrence,procedure_occurrence,drug_exposure,death,measurement,observation_period,condition_era}
|
22
|
+
|
23
|
+
# Removing existing OMOP concept tables
|
24
|
+
rm -rf $OMOP_FOLDER/{concept,concept_ancestor,concept_relationship}
|
25
|
+
|
26
|
+
# Copying OMOP concept tables if they don't already exist
|
27
|
+
for table in concept concept_relationship concept_ancestor; do
|
28
|
+
if [ ! -d "$OMOP_FOLDER/$table" ]; then
|
29
|
+
echo "Creating $OMOP_FOLDER/$table"
|
30
|
+
cp -r "$SOURCE_OMOP_FOLDER/$table" "$OMOP_FOLDER/$table"
|
31
|
+
fi
|
32
|
+
done
|
33
|
+
|
34
|
+
# Reconstructing the OMOP instance from patient sequences
|
35
|
+
echo "Reconstructing the OMOP instance from patient sequences in $OMOP_FOLDER"
|
36
|
+
python -m cehrgpt.generation.omop_converter_batch \
|
37
|
+
--patient_sequence_path "$PATIENT_SEQUENCE_FOLDER" \
|
38
|
+
--output_folder "$OMOP_FOLDER" \
|
39
|
+
--concept_path "$OMOP_FOLDER/concept" \
|
40
|
+
--buffer_size 1280 \
|
41
|
+
--cpu_cores 10
|
42
|
+
|
43
|
+
# Create observation_period
|
44
|
+
echo "Reconstructing observation_period in $OMOP_FOLDER"
|
45
|
+
python -u -m cehrgpt.omop.observation_period \
|
46
|
+
--input_folder "$OMOP_FOLDER" \
|
47
|
+
--output_folder "$OMOP_FOLDER" \
|
48
|
+
--domain_table_list "condition_occurrence drug_exposure procedure_occurrence measurement"
|
49
|
+
|
50
|
+
# Create condition_era
|
51
|
+
echo "Reconstructing condition_era in $OMOP_FOLDER"
|
52
|
+
python -u -m cehrgpt.omop.condition_era \
|
53
|
+
--input_folder "$OMOP_FOLDER" \
|
54
|
+
--output_folder "$OMOP_FOLDER" \
|
55
|
+
--domain_table_list "condition_occurrence"
|
@@ -1,9 +1,10 @@
|
|
1
|
-
from typing import Union
|
1
|
+
from typing import Optional, Union
|
2
2
|
|
3
3
|
from cehrbert.data_generators.hf_data_generator.hf_dataset import (
|
4
4
|
FINETUNING_COLUMNS,
|
5
5
|
apply_cehrbert_dataset_mapping,
|
6
6
|
)
|
7
|
+
from cehrbert.data_generators.hf_data_generator.meds_utils import CacheFileCollector
|
7
8
|
from cehrbert.runners.hf_runner_argument_dataclass import DataTrainingArguments
|
8
9
|
from datasets import Dataset, DatasetDict
|
9
10
|
|
@@ -31,16 +32,25 @@ def create_cehrgpt_pretraining_dataset(
|
|
31
32
|
dataset: Union[Dataset, DatasetDict],
|
32
33
|
cehrgpt_tokenizer: CehrGptTokenizer,
|
33
34
|
data_args: DataTrainingArguments,
|
34
|
-
|
35
|
+
cache_file_collector: Optional[CacheFileCollector] = None,
|
36
|
+
) -> Union[Dataset, DatasetDict]:
|
35
37
|
required_columns = TRANSFORMER_COLUMNS + CEHRGPT_COLUMNS
|
38
|
+
# TODO: temporary workaround; this column is mixed-typed and causes an issue when transforming the data
|
39
|
+
if not data_args.streaming:
|
40
|
+
if isinstance(dataset, DatasetDict):
|
41
|
+
all_columns = dataset["train"].column_names
|
42
|
+
else:
|
43
|
+
all_columns = dataset.column_names
|
44
|
+
if "visit_concept_ids" in all_columns:
|
45
|
+
dataset.remove_columns(["visit_concept_ids"])
|
36
46
|
dataset = apply_cehrbert_dataset_mapping(
|
37
47
|
dataset,
|
38
48
|
HFCehrGptTokenizationMapping(cehrgpt_tokenizer),
|
39
49
|
num_proc=data_args.preprocessing_num_workers,
|
40
50
|
batch_size=data_args.preprocessing_batch_size,
|
41
51
|
streaming=data_args.streaming,
|
52
|
+
cache_file_collector=cache_file_collector,
|
42
53
|
)
|
43
|
-
|
44
54
|
if not data_args.streaming:
|
45
55
|
if isinstance(dataset, DatasetDict):
|
46
56
|
all_columns = dataset["train"].column_names
|
@@ -56,8 +66,17 @@ def create_cehrgpt_finetuning_dataset(
|
|
56
66
|
dataset: Union[Dataset, DatasetDict],
|
57
67
|
cehrgpt_tokenizer: CehrGptTokenizer,
|
58
68
|
data_args: DataTrainingArguments,
|
59
|
-
|
69
|
+
cache_file_collector: Optional[CacheFileCollector] = None,
|
70
|
+
) -> Union[Dataset, DatasetDict]:
|
60
71
|
required_columns = TRANSFORMER_COLUMNS + CEHRGPT_COLUMNS + FINETUNING_COLUMNS
|
72
|
+
# TODO: temporary workaround; this column is mixed-typed and causes an issue when transforming the data
|
73
|
+
if not data_args.streaming:
|
74
|
+
if isinstance(dataset, DatasetDict):
|
75
|
+
all_columns = dataset["train"].column_names
|
76
|
+
else:
|
77
|
+
all_columns = dataset.column_names
|
78
|
+
if "visit_concept_ids" in all_columns:
|
79
|
+
dataset.remove_columns(["visit_concept_ids"])
|
61
80
|
mapping_functions = [
|
62
81
|
HFFineTuningMapping(cehrgpt_tokenizer),
|
63
82
|
]
|
@@ -68,6 +87,7 @@ def create_cehrgpt_finetuning_dataset(
|
|
68
87
|
num_proc=data_args.preprocessing_num_workers,
|
69
88
|
batch_size=data_args.preprocessing_batch_size,
|
70
89
|
streaming=data_args.streaming,
|
90
|
+
cache_file_collector=cache_file_collector,
|
71
91
|
)
|
72
92
|
|
73
93
|
if not data_args.streaming:
|