cehrgpt 0.1.2__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cehrgpt-0.1.3/PKG-INFO +238 -0
- cehrgpt-0.1.3/README.md +203 -0
- cehrgpt-0.1.3/constraints.txt +1 -0
- cehrgpt-0.1.3/data_generation.md +119 -0
- cehrgpt-0.1.3/feature_representation.md +109 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/pyproject.toml +1 -1
- cehrgpt-0.1.3/sample_configs/credential_file_sample.ini +5 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/sample_data/pretrain/patient_sequence.parquet +0 -0
- cehrgpt-0.1.3/scripts/create_cehrgpt_pretraining_data.sh +168 -0
- cehrgpt-0.1.3/scripts/extract_features_gpt.sh +231 -0
- cehrgpt-0.1.3/scripts/level_three_evaluation.sh +524 -0
- cehrgpt-0.1.3/scripts/omop_pipeline.sh +297 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/scripts/run_linear_prob.sh +8 -1
- cehrgpt-0.1.3/spark_setup.md +103 -0
- cehrgpt-0.1.3/src/cehrgpt/analysis/htn_treatment_pathway.py +546 -0
- cehrgpt-0.1.3/src/cehrgpt/analysis/treatment_pathway/depression_treatment_pathway.py +94 -0
- cehrgpt-0.1.3/src/cehrgpt/analysis/treatment_pathway/diabetes_treatment_pathway.py +94 -0
- cehrgpt-0.1.3/src/cehrgpt/analysis/treatment_pathway/htn_treatment_pathway.py +94 -0
- cehrgpt-0.1.3/src/cehrgpt/analysis/treatment_pathway/treatment_pathway.py +631 -0
- cehrgpt-0.1.3/src/cehrgpt/data/cehrgpt_data_processor.py +549 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/data/hf_cehrgpt_dataset.py +4 -0
- cehrgpt-0.1.3/src/cehrgpt/data/hf_cehrgpt_dataset_collator.py +677 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/data/hf_cehrgpt_dataset_mapping.py +38 -5
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/generation/cehrgpt_conditional_generation.py +2 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/generation/generate_batch_hf_gpt_sequence.py +20 -12
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/generation/omop_converter_batch.py +11 -4
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/gpt_utils.py +73 -3
- cehrgpt-0.1.3/src/cehrgpt/models/activations.py +27 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/models/config.py +6 -2
- cehrgpt-0.1.3/src/cehrgpt/models/gpt2.py +560 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/models/hf_cehrgpt.py +183 -460
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/models/tokenization_hf_cehrgpt.py +380 -50
- cehrgpt-0.1.3/src/cehrgpt/omop/ontology.py +154 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/runners/hf_cehrgpt_finetune_runner.py +24 -78
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +48 -44
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +46 -34
- cehrgpt-0.1.3/src/cehrgpt/runners/hyperparameter_search_util.py +336 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/runners/sample_packing_trainer.py +11 -2
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/tools/linear_prob/compute_cehrgpt_features.py +8 -2
- cehrgpt-0.1.3/src/cehrgpt.egg-info/PKG-INFO +238 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt.egg-info/SOURCES.txt +21 -1
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt.egg-info/requires.txt +1 -1
- cehrgpt-0.1.3/synthetic_data_generation.md +152 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/tests/integration_tests/runners/hf_cehrgpt_pretrain_runner_test.py +13 -5
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/tests/integration_tests/runners/hf_cehrgpt_pretrain_sample_packing_runner_test.py +7 -5
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/tests/integration_tests/runners/hf_cehrgpt_pretrain_sfm_runner_test.py +2 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/tests/unit_tests/models/model_utils_test.py +1 -1
- cehrgpt-0.1.3/tests/unit_tests/models/rotary_embedding_test.py +88 -0
- cehrgpt-0.1.3/tests/unit_tests/tools/__init__.py +0 -0
- cehrgpt-0.1.3/zero_shot_prediction.md +54 -0
- cehrgpt-0.1.2/PKG-INFO +0 -209
- cehrgpt-0.1.2/README.md +0 -174
- cehrgpt-0.1.2/scripts/level_three_evaluation.sh +0 -169
- cehrgpt-0.1.2/scripts/omop_pipeline.sh +0 -55
- cehrgpt-0.1.2/src/cehrgpt/data/hf_cehrgpt_dataset_collator.py +0 -1044
- cehrgpt-0.1.2/src/cehrgpt/runners/hyperparameter_search_util.py +0 -225
- cehrgpt-0.1.2/src/cehrgpt.egg-info/PKG-INFO +0 -209
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/.github/workflows/build-python.yaml +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/.github/workflows/tests.yaml +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/.gitignore +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/.pre-commit-config.yaml +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/LICENSE +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/sample_configs/cehrgpt_pretrain_sample_config.yaml +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/sample_data/omop_vocab/concept/concept.parquet +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/sample_data/pretrained_embeddings/pretrained_embedding_concepts.pkl +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/sample_data/pretrained_embeddings/pretrained_embedding_vectors.npy +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/scripts/pool_generated_sequences.sh +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/setup.cfg +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/__init__.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/__init__.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/analysis/__init__.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/analysis/irregularity.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/analysis/privacy/__init__.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/analysis/privacy/attribute_inference.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/analysis/privacy/attribute_inference_config.yml +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/analysis/privacy/member_inference.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/analysis/privacy/nearest_neighbor_inference.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/analysis/privacy/reid_inference.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/analysis/privacy/utils.py +0 -0
- {cehrgpt-0.1.2/src/cehrgpt/data → cehrgpt-0.1.3/src/cehrgpt/analysis/treatment_pathway}/__init__.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/cehrgpt_args.py +0 -0
- {cehrgpt-0.1.2/src/cehrgpt/generation → cehrgpt-0.1.3/src/cehrgpt/data}/__init__.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/data/sample_packing_sampler.py +0 -0
- {cehrgpt-0.1.2/src/cehrgpt/models → cehrgpt-0.1.3/src/cehrgpt/generation}/__init__.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/generation/chatgpt_generation.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/generation/omop_entity.py +0 -0
- {cehrgpt-0.1.2/src/cehrgpt/omop → cehrgpt-0.1.3/src/cehrgpt/models}/__init__.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/models/hf_modeling_outputs.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/models/pretrained_embeddings.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/models/special_tokens.py +0 -0
- {cehrgpt-0.1.2/src/cehrgpt/omop/queries → cehrgpt-0.1.3/src/cehrgpt/omop}/__init__.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/omop/condition_era.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/omop/observation_period.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/omop/omop_argparse.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/omop/omop_table_builder.py +0 -0
- {cehrgpt-0.1.2/src/cehrgpt/runners → cehrgpt-0.1.3/src/cehrgpt/omop/queries}/__init__.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/omop/queries/condition_era.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/omop/queries/observation_period.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/omop/sample_omop_tables.py +0 -0
- {cehrgpt-0.1.2/src/cehrgpt/simulations → cehrgpt-0.1.3/src/cehrgpt/runners}/__init__.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/runners/data_utils.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/runners/gpt_runner_util.py +0 -0
- {cehrgpt-0.1.2/src/cehrgpt/time_to_event → cehrgpt-0.1.3/src/cehrgpt/simulations}/__init__.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/simulations/generate_plots.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/simulations/run_simulation.sh +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/simulations/time_embedding_simulation.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/simulations/time_token_simulation.py +0 -0
- {cehrgpt-0.1.2/src/cehrgpt/tools → cehrgpt-0.1.3/src/cehrgpt/time_to_event}/__init__.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/time_to_event/config/1_year_cabg.yaml +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/time_to_event/config/30_day_readmission.yaml +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/time_to_event/config/next_visit_type_prediction.yaml +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/time_to_event/config/t2dm_hf.yaml +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/time_to_event/time_to_event_model.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/time_to_event/time_to_event_prediction.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/time_to_event/time_to_event_utils.py +0 -0
- {cehrgpt-0.1.2/src/cehrgpt/tools/linear_prob → cehrgpt-0.1.3/src/cehrgpt/tools}/__init__.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/tools/ehrshot_benchmark.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/tools/generate_causal_patient_split_by_age.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/tools/generate_pretrained_embeddings.py +0 -0
- {cehrgpt-0.1.2/tests → cehrgpt-0.1.3/src/cehrgpt/tools/linear_prob}/__init__.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/tools/linear_prob/train_with_cehrgpt_features.py +0 -0
- /cehrgpt-0.1.2/src/cehrgpt/tools/merge_synthetic_real_dataasets.py → /cehrgpt-0.1.3/src/cehrgpt/tools/merge_synthetic_real_datasets.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt/tools/upload_omop_tables.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt.egg-info/dependency_links.txt +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/src/cehrgpt.egg-info/top_level.txt +0 -0
- {cehrgpt-0.1.2/tests/integration_tests → cehrgpt-0.1.3/tests}/__init__.py +0 -0
- {cehrgpt-0.1.2/tests/integration_tests/runners → cehrgpt-0.1.3/tests/integration_tests}/__init__.py +0 -0
- {cehrgpt-0.1.2/tests/unit_tests → cehrgpt-0.1.3/tests/integration_tests/runners}/__init__.py +0 -0
- {cehrgpt-0.1.2/tests/unit_tests/models → cehrgpt-0.1.3/tests/unit_tests}/__init__.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/tests/unit_tests/gpt_utils_test.py +0 -0
- {cehrgpt-0.1.2/tests/unit_tests/models/tokenization → cehrgpt-0.1.3/tests/unit_tests/models}/__init__.py +0 -0
- {cehrgpt-0.1.2/tests/unit_tests/runners → cehrgpt-0.1.3/tests/unit_tests/models/tokenization}/__init__.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/tests/unit_tests/models/tokenization/create_bins_with_spline_test.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/tests/unit_tests/models/tokenization/create_sample_from_bins_test.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/tests/unit_tests/numeric_concept_statistics_test.py +0 -0
- {cehrgpt-0.1.2/tests/unit_tests/tools → cehrgpt-0.1.3/tests/unit_tests/runners}/__init__.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/tests/unit_tests/runners/hf_cehrgpt_finetune_runner_test.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/tests/unit_tests/tokenization_test.py +0 -0
- {cehrgpt-0.1.2 → cehrgpt-0.1.3}/tests/unit_tests/tools/upload_omop_tables_test.py +0 -0
cehrgpt-0.1.3/PKG-INFO
ADDED
@@ -0,0 +1,238 @@
Metadata-Version: 2.4
Name: cehrgpt
Version: 0.1.3
Summary: CEHR-GPT: Generating Electronic Health Records with Chronological Patient Timelines
Author-email: Chao Pang <chaopang229@gmail.com>, Xinzhuo Jiang <xj2193@cumc.columbia.edu>, Krishna Kalluri <kk3326@cumc.columbia.edu>, Elise Minto <em3697@cumc.columbia.edu>, Jason Patterson <jp3477@cumc.columbia.edu>, Nishanth Parameshwar Pavinkurve <np2689@cumc.columbia.edu>, Karthik Natarajan <kn2174@cumc.columbia.edu>
License: MIT License
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Requires-Python: >=3.10.0
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: cehrbert>=1.4.8
Requires-Dist: cehrbert_data==0.0.11
Requires-Dist: openai==1.54.3
Requires-Dist: optuna==4.0.0
Requires-Dist: transformers==4.44.1
Requires-Dist: tokenizers==0.19.0
Requires-Dist: peft==0.10.0
Requires-Dist: lightgbm
Requires-Dist: polars
Provides-Extra: dev
Requires-Dist: pre-commit; extra == "dev"
Requires-Dist: pytest; extra == "dev"
Requires-Dist: pytest-cov; extra == "dev"
Requires-Dist: pytest-subtests; extra == "dev"
Requires-Dist: rootutils; extra == "dev"
Requires-Dist: hypothesis; extra == "dev"
Requires-Dist: black; extra == "dev"
Provides-Extra: flash-attn
Requires-Dist: flash_attn; extra == "flash-attn"
Dynamic: license-file

# CEHRGPT

[PyPI](https://pypi.org/project/cehrgpt/)
[Tests](https://github.com/knatarajan-lab/cehrgpt/actions/workflows/tests.yaml)
[License](https://github.com/knatarajan-lab/cehrgpt/blob/main/LICENSE)
[Contributors](https://github.com/knatarajan-lab/cehrgpt/graphs/contributors)

CEHRGPT is a multi-task foundation model for structured electronic health records (EHR) data that supports three capabilities: feature representation, zero-shot prediction, and synthetic data generation.

## 🎯 Key Capabilities

### Feature Representation
Extract meaningful patient embeddings from sequences of medical events using **linear probing** techniques for downstream tasks such as disease prediction, patient clustering, and risk stratification.

### Zero-Shot Prediction
Generate outcome predictions directly from prompts without requiring task-specific training, enabling rapid evaluation in low-label clinical settings.

### Synthetic Data Generation
Generate comprehensive patient profiles including demographics, medical history, treatment courses, and outcomes while implementing advanced privacy-preserving techniques to ensure generated data contains no identifiable information.

The platform is fully compatible with the OMOP Common Data Model for seamless integration with existing healthcare systems.

## 🚀 Installation

Clone the repository and install dependencies:

```bash
git clone https://github.com/knatarajan-lab/cehrgpt.git
cd cehrgpt
pip install .
```

## 📋 Prerequisites

Before getting started, set up the required environment variables:

```bash
export CEHRGPT_HOME=$(git rev-parse --show-toplevel)
export OMOP_DIR=""            # Path to your OMOP data
export CEHR_GPT_DATA_DIR=""   # Path for processed data storage
export CEHR_GPT_MODEL_DIR=""  # Path for model storage
```

Create the dataset cache directory:

```bash
mkdir $CEHR_GPT_DATA_DIR/dataset_prepared
```

## 🏗️ Model Training

### Step 1: Generate Pre-training Data from OMOP

Generate the training data following the [Data Generation Instruction](./data_generation.md).

### Step 2: Pre-train CEHR-GPT

Train the foundation model:

```bash
python -u -m cehrgpt.runners.hf_cehrgpt_pretrain_runner \
  --model_name_or_path $CEHR_GPT_MODEL_DIR \
  --tokenizer_name_or_path $CEHR_GPT_MODEL_DIR \
  --output_dir $CEHR_GPT_MODEL_DIR \
  --data_folder "$CEHR_GPT_DATA_DIR/patient_sequence/train" \
  --dataset_prepared_path "$CEHR_GPT_DATA_DIR/dataset_prepared" \
  --do_train true --seed 42 \
  --dataloader_num_workers 16 --dataloader_prefetch_factor 8 \
  --hidden_size 768 --num_hidden_layers 14 --max_position_embeddings 4096 \
  --evaluation_strategy epoch --save_strategy epoch \
  --sample_packing --max_tokens_per_batch 16384 \
  --warmup_ratio 0.01 --weight_decay 0.01 \
  --num_train_epochs 50 --learning_rate 0.0002 \
  --use_early_stopping --early_stopping_threshold 0.001
```

> **Tip**: Increase `max_position_embeddings` for longer context windows based on your use case.

## 🎯 Feature Representation

CEHR-GPT enables extraction of meaningful patient embeddings from medical event sequences using **linear probing** techniques for downstream prediction tasks. The feature representation pipeline includes label generation, patient sequence extraction, and linear regression model training on the extracted representations.

For detailed instructions including cohort creation, patient feature extraction, and linear probing evaluation, please follow the [Feature Representation Guide](./feature_representation.md).

## 🔮 Zero-Shot Prediction

CEHR-GPT can generate outcome predictions directly from clinical prompts without requiring task-specific training, making it ideal for rapid evaluation in low-label clinical settings. The zero-shot prediction capability performs time-to-event analysis by processing patient sequences and generating risk predictions based on learned medical patterns.

For complete setup instructions including label generation, sequence preparation, and prediction execution, please follow the [Zero-Shot Prediction Guide](./zero_shot_prediction.md).

## 🧬 Synthetic Data Generation

CEHR-GPT generates comprehensive synthetic patient profiles including demographics, medical history, treatment courses, and outcomes while implementing advanced privacy-preserving techniques. The synthetic data maintains statistical fidelity to real patient populations without containing identifiable information, and outputs are fully compatible with the OMOP Common Data Model.

For step-by-step instructions on generating synthetic sequences and converting them to OMOP format, please follow the [Synthetic Data Generation Guide](./synthetic_data_generation.md).

## 📊 MEDS Support

CEHR-GPT supports the Medical Event Data Standard (MEDS) format for enhanced interoperability.

### Prerequisites

Configure MEDS-specific environment variables:

```bash
export CEHR_GPT_MODEL_DIR=""  # CEHR-GPT model directory
export MEDS_DIR=""            # MEDS data directory
export MEDS_READER_DIR=""     # MEDS reader output directory
```

### Step 1: Create MIMIC MEDS Data

Transform MIMIC files to MEDS format following the [MEDS_transforms](https://github.com/mmcdermott/MEDS_transforms/) repository instructions.

### Step 2: Prepare MEDS Reader

Convert MEDS data for CEHR-GPT compatibility:

```bash
meds_reader_convert $MEDS_DIR $MEDS_READER_DIR --num_threads 10
```

### Step 3: Pre-train with MEDS Data

Execute pre-training using MEDS format:

```bash
python -u -m cehrgpt.runners.hf_cehrgpt_pretrain_runner \
  --model_name_or_path $CEHR_GPT_MODEL_DIR \
  --tokenizer_name_or_path $CEHR_GPT_MODEL_DIR \
  --output_dir $CEHR_GPT_MODEL_DIR \
  --data_folder $MEDS_READER_DIR \
  --dataset_prepared_path "$CEHR_GPT_MODEL_DIR/dataset_prepared" \
  --do_train true --seed 42 \
  --dataloader_num_workers 16 --dataloader_prefetch_factor 8 \
  --hidden_size 768 --num_hidden_layers 14 --max_position_embeddings 8192 \
  --evaluation_strategy epoch --save_strategy epoch \
  --sample_packing --max_tokens_per_batch 16384 \
  --warmup_steps 500 --weight_decay 0.01 \
  --num_train_epochs 50 --learning_rate 0.0002 \
  --use_early_stopping --early_stopping_threshold 0.001 \
  --is_data_in_meds --inpatient_att_function_type day \
  --att_function_type day --include_inpatient_hour_token \
  --include_auxiliary_token --include_demographic_prompt \
  --meds_to_cehrbert_conversion_type "MedsToBertMimic4"
```

### Step 4: Generate MEDS Trajectories

#### Environment Setup

Configure trajectory generation environment:

```bash
export MEDS_LABEL_COHORT_DIR=""  # Cohort labels directory (parquet files)
export MEDS_TRAJECTORY_DIR=""    # Trajectory output directory
```

#### Generate Synthetic Trajectories

Create patient trajectories with the trained model:

```bash
python -u -m cehrgpt.generation.cehrgpt_conditional_generation \
  --cohort_folder $MEDS_LABEL_COHORT_DIR \
  --data_folder $MEDS_READER_DIR \
  --dataset_prepared_path "$CEHR_GPT_MODEL_DIR/dataset_prepared" \
  --model_name_or_path $CEHR_GPT_MODEL_DIR \
  --tokenizer_name_or_path $CEHR_GPT_MODEL_DIR \
  --output_dir $MEDS_TRAJECTORY_DIR \
  --per_device_eval_batch_size 16 \
  --num_of_trajectories_per_sample 2 \
  --generation_input_length 4096 \
  --generation_max_new_tokens 4096 \
  --is_data_in_meds \
  --att_function_type day --inpatient_att_function_type day \
  --meds_to_cehrbert_conversion_type MedsToBertMimic4 \
  --include_auxiliary_token --include_demographic_prompt \
  --include_inpatient_hour_token
```

> **Important**: Ensure `generation_input_length` + `generation_max_new_tokens` ≤ `max_position_embeddings` (8192).

#### Parameter Reference

- `generation_input_length`: Input context length for generation
- `generation_max_new_tokens`: Maximum new tokens to generate
- `num_of_trajectories_per_sample`: Number of trajectories per patient sample

## 📖 Citation

If you use CEHRGPT in your research, please cite:

```bibtex
@article{cehrgpt2024,
  title={CEHRGPT: Synthetic Data Generation for Electronic Health Records},
  author={Natarajan, K and others},
  journal={arXiv preprint arXiv:2402.04400},
  year={2024}
}
```

## 📄 License

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
cehrgpt-0.1.3/README.md
ADDED
@@ -0,0 +1,203 @@
# CEHRGPT

[PyPI](https://pypi.org/project/cehrgpt/)
[Tests](https://github.com/knatarajan-lab/cehrgpt/actions/workflows/tests.yaml)
[License](https://github.com/knatarajan-lab/cehrgpt/blob/main/LICENSE)
[Contributors](https://github.com/knatarajan-lab/cehrgpt/graphs/contributors)

CEHRGPT is a multi-task foundation model for structured electronic health records (EHR) data that supports three capabilities: feature representation, zero-shot prediction, and synthetic data generation.

## 🎯 Key Capabilities

### Feature Representation
Extract meaningful patient embeddings from sequences of medical events using **linear probing** techniques for downstream tasks such as disease prediction, patient clustering, and risk stratification.

### Zero-Shot Prediction
Generate outcome predictions directly from prompts without requiring task-specific training, enabling rapid evaluation in low-label clinical settings.

### Synthetic Data Generation
Generate comprehensive patient profiles including demographics, medical history, treatment courses, and outcomes while implementing advanced privacy-preserving techniques to ensure generated data contains no identifiable information.

The platform is fully compatible with the OMOP Common Data Model for seamless integration with existing healthcare systems.

## 🚀 Installation

Clone the repository and install dependencies:

```bash
git clone https://github.com/knatarajan-lab/cehrgpt.git
cd cehrgpt
pip install .
```

## 📋 Prerequisites

Before getting started, set up the required environment variables:

```bash
export CEHRGPT_HOME=$(git rev-parse --show-toplevel)
export OMOP_DIR=""            # Path to your OMOP data
export CEHR_GPT_DATA_DIR=""   # Path for processed data storage
export CEHR_GPT_MODEL_DIR=""  # Path for model storage
```

Create the dataset cache directory:

```bash
mkdir $CEHR_GPT_DATA_DIR/dataset_prepared
```

## 🏗️ Model Training

### Step 1: Generate Pre-training Data from OMOP

Generate the training data following the [Data Generation Instruction](./data_generation.md).

### Step 2: Pre-train CEHR-GPT

Train the foundation model:

```bash
python -u -m cehrgpt.runners.hf_cehrgpt_pretrain_runner \
  --model_name_or_path $CEHR_GPT_MODEL_DIR \
  --tokenizer_name_or_path $CEHR_GPT_MODEL_DIR \
  --output_dir $CEHR_GPT_MODEL_DIR \
  --data_folder "$CEHR_GPT_DATA_DIR/patient_sequence/train" \
  --dataset_prepared_path "$CEHR_GPT_DATA_DIR/dataset_prepared" \
  --do_train true --seed 42 \
  --dataloader_num_workers 16 --dataloader_prefetch_factor 8 \
  --hidden_size 768 --num_hidden_layers 14 --max_position_embeddings 4096 \
  --evaluation_strategy epoch --save_strategy epoch \
  --sample_packing --max_tokens_per_batch 16384 \
  --warmup_ratio 0.01 --weight_decay 0.01 \
  --num_train_epochs 50 --learning_rate 0.0002 \
  --use_early_stopping --early_stopping_threshold 0.001
```

> **Tip**: Increase `max_position_embeddings` for longer context windows based on your use case.

## 🎯 Feature Representation

CEHR-GPT enables extraction of meaningful patient embeddings from medical event sequences using **linear probing** techniques for downstream prediction tasks. The feature representation pipeline includes label generation, patient sequence extraction, and linear regression model training on the extracted representations.

For detailed instructions including cohort creation, patient feature extraction, and linear probing evaluation, please follow the [Feature Representation Guide](./feature_representation.md).

## 🔮 Zero-Shot Prediction

CEHR-GPT can generate outcome predictions directly from clinical prompts without requiring task-specific training, making it ideal for rapid evaluation in low-label clinical settings. The zero-shot prediction capability performs time-to-event analysis by processing patient sequences and generating risk predictions based on learned medical patterns.

For complete setup instructions including label generation, sequence preparation, and prediction execution, please follow the [Zero-Shot Prediction Guide](./zero_shot_prediction.md).

## 🧬 Synthetic Data Generation

CEHR-GPT generates comprehensive synthetic patient profiles including demographics, medical history, treatment courses, and outcomes while implementing advanced privacy-preserving techniques. The synthetic data maintains statistical fidelity to real patient populations without containing identifiable information, and outputs are fully compatible with the OMOP Common Data Model.

For step-by-step instructions on generating synthetic sequences and converting them to OMOP format, please follow the [Synthetic Data Generation Guide](./synthetic_data_generation.md).

## 📊 MEDS Support

CEHR-GPT supports the Medical Event Data Standard (MEDS) format for enhanced interoperability.

### Prerequisites

Configure MEDS-specific environment variables:

```bash
export CEHR_GPT_MODEL_DIR=""  # CEHR-GPT model directory
export MEDS_DIR=""            # MEDS data directory
export MEDS_READER_DIR=""     # MEDS reader output directory
```

### Step 1: Create MIMIC MEDS Data

Transform MIMIC files to MEDS format following the [MEDS_transforms](https://github.com/mmcdermott/MEDS_transforms/) repository instructions.

### Step 2: Prepare MEDS Reader

Convert MEDS data for CEHR-GPT compatibility:

```bash
meds_reader_convert $MEDS_DIR $MEDS_READER_DIR --num_threads 10
```

### Step 3: Pre-train with MEDS Data

Execute pre-training using MEDS format:

```bash
python -u -m cehrgpt.runners.hf_cehrgpt_pretrain_runner \
  --model_name_or_path $CEHR_GPT_MODEL_DIR \
  --tokenizer_name_or_path $CEHR_GPT_MODEL_DIR \
  --output_dir $CEHR_GPT_MODEL_DIR \
  --data_folder $MEDS_READER_DIR \
  --dataset_prepared_path "$CEHR_GPT_MODEL_DIR/dataset_prepared" \
  --do_train true --seed 42 \
  --dataloader_num_workers 16 --dataloader_prefetch_factor 8 \
  --hidden_size 768 --num_hidden_layers 14 --max_position_embeddings 8192 \
  --evaluation_strategy epoch --save_strategy epoch \
  --sample_packing --max_tokens_per_batch 16384 \
  --warmup_steps 500 --weight_decay 0.01 \
  --num_train_epochs 50 --learning_rate 0.0002 \
  --use_early_stopping --early_stopping_threshold 0.001 \
  --is_data_in_meds --inpatient_att_function_type day \
  --att_function_type day --include_inpatient_hour_token \
  --include_auxiliary_token --include_demographic_prompt \
  --meds_to_cehrbert_conversion_type "MedsToBertMimic4"
```

### Step 4: Generate MEDS Trajectories

#### Environment Setup

Configure trajectory generation environment:

```bash
export MEDS_LABEL_COHORT_DIR=""  # Cohort labels directory (parquet files)
export MEDS_TRAJECTORY_DIR=""    # Trajectory output directory
```

#### Generate Synthetic Trajectories

Create patient trajectories with the trained model:

```bash
python -u -m cehrgpt.generation.cehrgpt_conditional_generation \
  --cohort_folder $MEDS_LABEL_COHORT_DIR \
  --data_folder $MEDS_READER_DIR \
  --dataset_prepared_path "$CEHR_GPT_MODEL_DIR/dataset_prepared" \
  --model_name_or_path $CEHR_GPT_MODEL_DIR \
  --tokenizer_name_or_path $CEHR_GPT_MODEL_DIR \
  --output_dir $MEDS_TRAJECTORY_DIR \
  --per_device_eval_batch_size 16 \
  --num_of_trajectories_per_sample 2 \
  --generation_input_length 4096 \
  --generation_max_new_tokens 4096 \
  --is_data_in_meds \
  --att_function_type day --inpatient_att_function_type day \
  --meds_to_cehrbert_conversion_type MedsToBertMimic4 \
  --include_auxiliary_token --include_demographic_prompt \
  --include_inpatient_hour_token
```

> **Important**: Ensure `generation_input_length` + `generation_max_new_tokens` ≤ `max_position_embeddings` (8192).
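As a quick sanity check, this budget can be verified against the saved model configuration before launching generation. The snippet below is a minimal sketch and assumes the model directory contains a Hugging Face-style `config.json` that exposes `max_position_embeddings`; the two generation values simply mirror the command above.

```python
# Sketch: verify the generation token budget against the saved model config.
# Assumes $CEHR_GPT_MODEL_DIR contains a Hugging Face-style config.json with
# a "max_position_embeddings" entry (as set by the pre-training command above).
import json
import os

generation_input_length = 4096
generation_max_new_tokens = 4096

config_path = os.path.join(os.environ["CEHR_GPT_MODEL_DIR"], "config.json")
with open(config_path) as f:
    max_position_embeddings = json.load(f)["max_position_embeddings"]

budget = generation_input_length + generation_max_new_tokens
assert budget <= max_position_embeddings, (
    f"generation_input_length + generation_max_new_tokens = {budget} "
    f"exceeds max_position_embeddings = {max_position_embeddings}"
)
print(f"Token budget OK: {budget} <= {max_position_embeddings}")
```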
#### Parameter Reference

- `generation_input_length`: Input context length for generation
- `generation_max_new_tokens`: Maximum new tokens to generate
- `num_of_trajectories_per_sample`: Number of trajectories per patient sample

## 📖 Citation

If you use CEHRGPT in your research, please cite:

```bibtex
@article{cehrgpt2024,
  title={CEHRGPT: Synthetic Data Generation for Electronic Health Records},
  author={Natarajan, K and others},
  journal={arXiv preprint arXiv:2402.04400},
  year={2024}
}
```

## 📄 License

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

cehrgpt-0.1.3/constraints.txt
ADDED
@@ -0,0 +1 @@
xformers==0 # blocks installation

cehrgpt-0.1.3/data_generation.md
ADDED
@@ -0,0 +1,119 @@
# CEHR-GPT Data Generation

This guide covers the process of generating pre-training data for CEHR-GPT from OMOP-formatted healthcare datasets.

## Prerequisites

Before starting data generation, ensure you have:

1. **Spark Environment**: Configured Apache Spark (see [Spark Setup README](./spark_setup.md))
2. **OMOP Data**: Healthcare data in OMOP Common Data Model format
3. **Environment Variables**: Required paths and directories set up

## Required Environment Variables

Set up the necessary directory paths:

```bash
# CEHR-GPT installation directory
export CEHRGPT_HOME=$(git rev-parse --show-toplevel)

# OMOP input data directory
export OMOP_DIR="/path/to/omop/data"

# Output directory for processed data
export CEHR_GPT_DATA_DIR="/path/to/output/data"
```

## Step 1: Configure Spark for Data Processing

Set up Spark environment variables optimized for healthcare data processing:

```bash
# Worker configuration
export SPARK_WORKER_INSTANCES="1"
export SPARK_MASTER="local[16]"
export SPARK_WORKER_CORES="16"
export SPARK_EXECUTOR_CORES="4"

# Memory configuration
export SPARK_DRIVER_MEMORY="20g"
export SPARK_EXECUTOR_MEMORY="20g"

export SPARK_SUBMIT_OPTIONS="--master $SPARK_MASTER --driver-memory $SPARK_DRIVER_MEMORY --executor-memory $SPARK_EXECUTOR_MEMORY --executor-cores $SPARK_EXECUTOR_CORES"
```

### Configuration Guidelines

**Memory Allocation:**
- **Small datasets (< 1M patients)**: 8GB driver/executor memory
- **Medium datasets (1-10M patients)**: 12-16GB driver/executor memory
- **Large datasets (> 10M patients)**: 20-32GB driver/executor memory

**Core Allocation:**
- Adjust `SPARK_WORKER_CORES` based on available CPU cores
- Keep `SPARK_EXECUTOR_CORES` at 2-4 for optimal performance
- Reserve 2-4 cores for system processes
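To make these guidelines concrete, the sketch below maps an approximate patient count onto the memory tiers listed above and prints the corresponding export lines. The thresholds restate the table; the specific value chosen within each range is illustrative and should be tuned to your cluster.

```python
# Sketch: pick Spark memory settings from the guideline tiers above.
# The thresholds restate the list; the exact value within each range is illustrative.
def spark_memory_for(num_patients: int) -> str:
    if num_patients < 1_000_000:       # small datasets
        return "8g"
    elif num_patients <= 10_000_000:   # medium datasets
        return "16g"
    else:                              # large datasets
        return "32g"

memory = spark_memory_for(4_500_000)
print(f'export SPARK_DRIVER_MEMORY="{memory}"')
print(f'export SPARK_EXECUTOR_MEMORY="{memory}"')
```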
## Step 2: Generate Pre-training Data

Execute the data generation script:

```bash
sh $CEHRGPT_HOME/scripts/create_cehrgpt_pretraining_data.sh \
  --input_folder $OMOP_DIR \
  --output_folder $CEHR_GPT_DATA_DIR \
  --start_date "1985-01-01"
```

### Script Parameters

- `--input_folder`: Directory containing OMOP-formatted data files
- `--output_folder`: Directory where processed data will be saved
- `--start_date`: Earliest date for including patient records (format: YYYY-MM-DD)
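Once the script finishes, it is worth confirming the output before moving on to pre-training. The sketch below is a minimal check that assumes the sequences are written as Parquet under `patient_sequence/train` inside the output folder, matching the path read by the pre-training command in the README; adjust the path if your layout differs.

```python
# Sketch: sanity-check the generated pre-training data.
# Assumes the output is Parquet under patient_sequence/train, matching the
# path used by the pre-training command in the README; adjust if different.
import os

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("inspect_patient_sequence").getOrCreate()

train_path = os.path.join(os.environ["CEHR_GPT_DATA_DIR"], "patient_sequence", "train")
patient_sequence = spark.read.parquet(train_path)

print(f"Number of training sequences: {patient_sequence.count()}")
patient_sequence.printSchema()
```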
## Performance Optimization

### For Large Datasets

```bash
# Increase parallelism
export SPARK_SQL_SHUFFLE_PARTITIONS="800"

# Enable dynamic allocation
export SPARK_CONF_spark_dynamicAllocation_enabled="true"
export SPARK_CONF_spark_dynamicAllocation_minExecutors="2"
export SPARK_CONF_spark_dynamicAllocation_maxExecutors="20"
```

### Memory Optimization

```bash
# Tune garbage collection
export SPARK_CONF_spark_executor_extraJavaOptions="-XX:+UseG1GC -XX:+PrintGCDetails"

# Optimize serialization
export SPARK_CONF_spark_serializer_objectStreamReset="100"
```

## Troubleshooting

### Common Issues

**Out of Memory Errors:**
```bash
# Increase driver memory
export SPARK_DRIVER_MEMORY="20g"

# Increase executor memory
export SPARK_EXECUTOR_MEMORY="16g"
```

**Slow Performance:**
```bash
# Increase parallelism
export SPARK_WORKER_CORES="32"

# Enable adaptive query execution
export SPARK_CONF_spark_sql_adaptive_enabled="true"
```

cehrgpt-0.1.3/feature_representation.md
ADDED
@@ -0,0 +1,109 @@
# CEHR-GPT Feature Representation using Linear Probing

This guide covers the process of extracting meaningful patient embeddings from healthcare sequences using **linear probing** techniques for downstream prediction tasks such as disease prediction, patient clustering, and risk stratification.

## Prerequisites

Ensure you have:

1. **Trained CEHR-GPT Model**: Pre-trained model available at `$CEHR_GPT_MODEL_DIR`
2. **OMOP Data**: Healthcare data processed and ready for feature extraction
3. **Environment Setup**: Required environment variables configured

## Required Environment Variables

Set up the necessary directory paths:

```bash
# CEHR-GPT installation directory (auto-detect from git repository)
export CEHRGPT_HOME=$(git rev-parse --show-toplevel)
export CEHR_GPT_MODEL_DIR="/path/to/trained/model"

# Data directories
export OMOP_DIR="/path/to/omop/data"
export CEHR_GPT_DATA_DIR="/path/to/processed/data"
export CEHRGPT_FEATURES_DIR="/path/to/extracted/features"
```

## Step 1: Generate Prediction Labels

Create heart failure readmission labels compatible with MEDS schema for downstream prediction tasks:

```bash
python -u -m cehrbert_data.prediction_cohorts.hf_readmission \
  -c hf_readmission -i $OMOP_DIR -o $OMOP_DIR/labels \
  -dl 1985-01-01 -du 2023-12-31 \
  -l 18 -u 100 -ow 730 -ps 1 -pw 30 \
  --is_new_patient_representation \
  --should_construct_artificial_visits \
  --include_concept_list \
  --is_remove_index_prediction_starts \
  --meds_format \
  --exclude_features
```

### Parameter Explanation

- `-c hf_readmission`: Cohort name for heart failure readmission prediction
- `-i $OMOP_DIR`: Input directory containing OMOP data
- `-o $OMOP_DIR/labels`: Output directory for generated labels
- `-dl/-du`: Date range for patient inclusion (1985-2023)
- `-l 18 -u 100`: Age limits (18-100 years)
- `-ow 730`: Observation window in days (2 years)
- `-ps 1 -pw 30`: Prediction start (1 day) and window (30 days)
- `--is_remove_index_prediction_starts`: Remove cases where outcome events occur before prediction start date
- `--include_concept_list`: Include only concepts that are allowed in the model vocabulary
- `--meds_format`: Output in MEDS-compatible format
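Before extracting features, it can help to inspect the generated labels. The sketch below assumes the cohort is written as Parquet files under `$OMOP_DIR/labels` and loosely follows the MEDS label convention (a subject identifier, a prediction time, and a boolean outcome); the column names are assumptions, so check the printed schema against your output.

```python
# Sketch: inspect the generated heart failure readmission labels.
# Assumes Parquet files under $OMOP_DIR/labels; the column names below follow
# the MEDS label convention and may differ in your output.
import glob
import os

import pandas as pd

label_files = glob.glob(
    os.path.join(os.environ["OMOP_DIR"], "labels", "**", "*.parquet"), recursive=True
)
labels = pd.concat(pd.read_parquet(f) for f in label_files)

print(labels.columns.tolist())
print(f"{len(labels)} labeled samples")
if "boolean_value" in labels.columns:
    print(f"Positive rate: {labels['boolean_value'].mean():.3f}")
```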
## Step 2: Extract Patient Features

Extract patient sequences using a 2-year observation window, focusing on key clinical events:

```bash
sh $CEHRGPT_HOME/scripts/extract_features_gpt.sh \
  --cohort-folder $OMOP_DIR/labels \
  --input-dir $OMOP_DIR \
  --output-dir "$CEHR_GPT_DATA_DIR/phenotype_cehrgpt_sequences" \
  --patient-splits-folder "$OMOP_DIR/patient_splits" \
  --ehr-tables "condition_occurrence procedure_occurrence drug_exposure" \
  --observation-window 730
```

> **Tip**: This step requires PySpark; see the [Spark Setup README](./spark_setup.md) for configuring the Apache Spark environment.

### Key Parameters

- `--cohort-folder`: Directory containing prediction labels
- `--input-dir`: Source OMOP data directory
- `--output-dir`: Output directory for extracted sequences
- `--patient-splits-folder`: Pre-defined train/validation/test splits
- `--ehr-tables`: Clinical tables to include in feature extraction
- `--observation-window`: Observation period in days (730 = 2 years)

## Step 3: Run Feature Extraction and Linear Probing

Execute CEHR-GPT feature extraction and train a linear regression model on the extracted patient representations:

```bash
sh $CEHRGPT_HOME/run_cehrgpt.sh \
  --base_dir="$CEHR_GPT_DATA_DIR/phenotype_cehrgpt_sequences" \
  --dataset_prepared_path="$CEHR_GPT_DATA_DIR/dataset_prepared" \
  --model_path=$CEHR_GPT_MODEL_DIR \
  --output_dir=$CEHRGPT_FEATURES_DIR \
  --preprocessing_workers=8 \
  --model_name="cehrgpt"
```

This step performs both feature extraction from patient sequences and linear regression training on the extracted patient representations for downstream prediction tasks.
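Conceptually, linear probing fits a simple linear classifier on frozen CEHR-GPT patient representations. The sketch below illustrates the idea with scikit-learn, assuming the features and binary labels have already been exported as NumPy arrays; the file names and the logistic-regression choice are illustrative rather than the exact implementation used by the script.

```python
# Sketch: linear probing on frozen patient representations.
# Assumes features.npy (n_patients x hidden_size) and labels.npy (0/1 outcomes)
# have been exported from the feature extraction step; names are illustrative.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

features = np.load("features.npy")
labels = np.load("labels.npy")

x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

# The frozen embeddings are the inputs; only this linear layer is trained.
probe = LogisticRegression(max_iter=1000)
probe.fit(x_train, y_train)

print(f"Test AUROC: {roc_auc_score(y_test, probe.predict_proba(x_test)[:, 1]):.3f}")
```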
### Parameter Details

- `--base_dir`: Directory containing prepared patient sequences
- `--dataset_prepared_path`: Path for preprocessed datasets
- `--model_path`: Location of trained CEHR-GPT model
- `--output_dir`: Output directory for extracted features and embeddings
- `--preprocessing_workers`: Number of parallel workers for data preprocessing
- `--model_name`: Model identifier for feature extraction