cehrgpt 0.1.1__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. cehrgpt-0.1.3/PKG-INFO +238 -0
  2. cehrgpt-0.1.3/README.md +203 -0
  3. cehrgpt-0.1.3/constraints.txt +1 -0
  4. cehrgpt-0.1.3/data_generation.md +119 -0
  5. cehrgpt-0.1.3/feature_representation.md +109 -0
  6. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/pyproject.toml +1 -1
  7. cehrgpt-0.1.3/sample_configs/credential_file_sample.ini +5 -0
  8. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/sample_data/pretrain/patient_sequence.parquet +0 -0
  9. cehrgpt-0.1.3/scripts/create_cehrgpt_pretraining_data.sh +168 -0
  10. cehrgpt-0.1.3/scripts/extract_features_gpt.sh +231 -0
  11. cehrgpt-0.1.3/scripts/level_three_evaluation.sh +524 -0
  12. cehrgpt-0.1.3/scripts/omop_pipeline.sh +297 -0
  13. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/scripts/run_linear_prob.sh +8 -1
  14. cehrgpt-0.1.3/spark_setup.md +103 -0
  15. cehrgpt-0.1.3/src/cehrgpt/analysis/htn_treatment_pathway.py +546 -0
  16. cehrgpt-0.1.3/src/cehrgpt/analysis/treatment_pathway/depression_treatment_pathway.py +94 -0
  17. cehrgpt-0.1.3/src/cehrgpt/analysis/treatment_pathway/diabetes_treatment_pathway.py +94 -0
  18. cehrgpt-0.1.3/src/cehrgpt/analysis/treatment_pathway/htn_treatment_pathway.py +94 -0
  19. cehrgpt-0.1.3/src/cehrgpt/analysis/treatment_pathway/treatment_pathway.py +631 -0
  20. cehrgpt-0.1.3/src/cehrgpt/data/cehrgpt_data_processor.py +549 -0
  21. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/data/hf_cehrgpt_dataset.py +4 -0
  22. cehrgpt-0.1.3/src/cehrgpt/data/hf_cehrgpt_dataset_collator.py +677 -0
  23. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/data/hf_cehrgpt_dataset_mapping.py +60 -14
  24. cehrgpt-0.1.3/src/cehrgpt/generation/cehrgpt_conditional_generation.py +316 -0
  25. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/generation/generate_batch_hf_gpt_sequence.py +35 -15
  26. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/generation/omop_converter_batch.py +11 -4
  27. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/gpt_utils.py +73 -3
  28. cehrgpt-0.1.3/src/cehrgpt/models/activations.py +27 -0
  29. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/models/config.py +6 -2
  30. cehrgpt-0.1.3/src/cehrgpt/models/gpt2.py +560 -0
  31. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/models/hf_cehrgpt.py +193 -459
  32. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/models/tokenization_hf_cehrgpt.py +380 -50
  33. cehrgpt-0.1.3/src/cehrgpt/omop/ontology.py +154 -0
  34. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/runners/data_utils.py +17 -6
  35. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/runners/hf_cehrgpt_finetune_runner.py +33 -79
  36. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +48 -44
  37. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +58 -34
  38. cehrgpt-0.1.3/src/cehrgpt/runners/hyperparameter_search_util.py +336 -0
  39. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/runners/sample_packing_trainer.py +11 -2
  40. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/tools/linear_prob/compute_cehrgpt_features.py +27 -31
  41. cehrgpt-0.1.3/src/cehrgpt.egg-info/PKG-INFO +238 -0
  42. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt.egg-info/SOURCES.txt +22 -1
  43. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt.egg-info/requires.txt +1 -1
  44. cehrgpt-0.1.3/synthetic_data_generation.md +152 -0
  45. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/tests/integration_tests/runners/hf_cehrgpt_pretrain_runner_test.py +13 -5
  46. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/tests/integration_tests/runners/hf_cehrgpt_pretrain_sample_packing_runner_test.py +7 -5
  47. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/tests/integration_tests/runners/hf_cehrgpt_pretrain_sfm_runner_test.py +2 -0
  48. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/tests/unit_tests/models/model_utils_test.py +1 -1
  49. cehrgpt-0.1.3/tests/unit_tests/models/rotary_embedding_test.py +88 -0
  50. cehrgpt-0.1.3/tests/unit_tests/tools/__init__.py +0 -0
  51. cehrgpt-0.1.3/zero_shot_prediction.md +54 -0
  52. cehrgpt-0.1.1/PKG-INFO +0 -115
  53. cehrgpt-0.1.1/README.md +0 -80
  54. cehrgpt-0.1.1/scripts/level_three_evaluation.sh +0 -169
  55. cehrgpt-0.1.1/scripts/omop_pipeline.sh +0 -55
  56. cehrgpt-0.1.1/src/cehrgpt/data/hf_cehrgpt_dataset_collator.py +0 -1020
  57. cehrgpt-0.1.1/src/cehrgpt/runners/hyperparameter_search_util.py +0 -225
  58. cehrgpt-0.1.1/src/cehrgpt.egg-info/PKG-INFO +0 -115
  59. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/.github/workflows/build-python.yaml +0 -0
  60. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/.github/workflows/tests.yaml +0 -0
  61. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/.gitignore +0 -0
  62. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/.pre-commit-config.yaml +0 -0
  63. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/LICENSE +0 -0
  64. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/sample_configs/cehrgpt_pretrain_sample_config.yaml +0 -0
  65. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/sample_data/omop_vocab/concept/concept.parquet +0 -0
  66. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/sample_data/pretrained_embeddings/pretrained_embedding_concepts.pkl +0 -0
  67. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/sample_data/pretrained_embeddings/pretrained_embedding_vectors.npy +0 -0
  68. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/scripts/pool_generated_sequences.sh +0 -0
  69. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/setup.cfg +0 -0
  70. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/__init__.py +0 -0
  71. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/__init__.py +0 -0
  72. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/analysis/__init__.py +0 -0
  73. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/analysis/irregularity.py +0 -0
  74. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/analysis/privacy/__init__.py +0 -0
  75. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/analysis/privacy/attribute_inference.py +0 -0
  76. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/analysis/privacy/attribute_inference_config.yml +0 -0
  77. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/analysis/privacy/member_inference.py +0 -0
  78. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/analysis/privacy/nearest_neighbor_inference.py +0 -0
  79. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/analysis/privacy/reid_inference.py +0 -0
  80. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/analysis/privacy/utils.py +0 -0
  81. {cehrgpt-0.1.1/src/cehrgpt/data → cehrgpt-0.1.3/src/cehrgpt/analysis/treatment_pathway}/__init__.py +0 -0
  82. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/cehrgpt_args.py +0 -0
  83. {cehrgpt-0.1.1/src/cehrgpt/generation → cehrgpt-0.1.3/src/cehrgpt/data}/__init__.py +0 -0
  84. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/data/sample_packing_sampler.py +0 -0
  85. {cehrgpt-0.1.1/src/cehrgpt/models → cehrgpt-0.1.3/src/cehrgpt/generation}/__init__.py +0 -0
  86. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/generation/chatgpt_generation.py +0 -0
  87. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/generation/omop_entity.py +0 -0
  88. {cehrgpt-0.1.1/src/cehrgpt/omop → cehrgpt-0.1.3/src/cehrgpt/models}/__init__.py +0 -0
  89. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/models/hf_modeling_outputs.py +0 -0
  90. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/models/pretrained_embeddings.py +0 -0
  91. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/models/special_tokens.py +0 -0
  92. {cehrgpt-0.1.1/src/cehrgpt/omop/queries → cehrgpt-0.1.3/src/cehrgpt/omop}/__init__.py +0 -0
  93. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/omop/condition_era.py +0 -0
  94. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/omop/observation_period.py +0 -0
  95. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/omop/omop_argparse.py +0 -0
  96. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/omop/omop_table_builder.py +0 -0
  97. {cehrgpt-0.1.1/src/cehrgpt/runners → cehrgpt-0.1.3/src/cehrgpt/omop/queries}/__init__.py +0 -0
  98. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/omop/queries/condition_era.py +0 -0
  99. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/omop/queries/observation_period.py +0 -0
  100. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/omop/sample_omop_tables.py +0 -0
  101. {cehrgpt-0.1.1/src/cehrgpt/simulations → cehrgpt-0.1.3/src/cehrgpt/runners}/__init__.py +0 -0
  102. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/runners/gpt_runner_util.py +0 -0
  103. {cehrgpt-0.1.1/src/cehrgpt/time_to_event → cehrgpt-0.1.3/src/cehrgpt/simulations}/__init__.py +0 -0
  104. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/simulations/generate_plots.py +0 -0
  105. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/simulations/run_simulation.sh +0 -0
  106. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/simulations/time_embedding_simulation.py +0 -0
  107. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/simulations/time_token_simulation.py +0 -0
  108. {cehrgpt-0.1.1/src/cehrgpt/tools → cehrgpt-0.1.3/src/cehrgpt/time_to_event}/__init__.py +0 -0
  109. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/time_to_event/config/1_year_cabg.yaml +0 -0
  110. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/time_to_event/config/30_day_readmission.yaml +0 -0
  111. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/time_to_event/config/next_visit_type_prediction.yaml +0 -0
  112. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/time_to_event/config/t2dm_hf.yaml +0 -0
  113. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/time_to_event/time_to_event_model.py +0 -0
  114. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/time_to_event/time_to_event_prediction.py +0 -0
  115. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/time_to_event/time_to_event_utils.py +0 -0
  116. {cehrgpt-0.1.1/src/cehrgpt/tools/linear_prob → cehrgpt-0.1.3/src/cehrgpt/tools}/__init__.py +0 -0
  117. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/tools/ehrshot_benchmark.py +0 -0
  118. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/tools/generate_causal_patient_split_by_age.py +0 -0
  119. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/tools/generate_pretrained_embeddings.py +0 -0
  120. {cehrgpt-0.1.1/tests → cehrgpt-0.1.3/src/cehrgpt/tools/linear_prob}/__init__.py +0 -0
  121. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/tools/linear_prob/train_with_cehrgpt_features.py +0 -0
  122. /cehrgpt-0.1.1/src/cehrgpt/tools/merge_synthetic_real_dataasets.py → /cehrgpt-0.1.3/src/cehrgpt/tools/merge_synthetic_real_datasets.py +0 -0
  123. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt/tools/upload_omop_tables.py +0 -0
  124. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt.egg-info/dependency_links.txt +0 -0
  125. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/src/cehrgpt.egg-info/top_level.txt +0 -0
  126. {cehrgpt-0.1.1/tests/integration_tests → cehrgpt-0.1.3/tests}/__init__.py +0 -0
  127. {cehrgpt-0.1.1/tests/integration_tests/runners → cehrgpt-0.1.3/tests/integration_tests}/__init__.py +0 -0
  128. {cehrgpt-0.1.1/tests/unit_tests → cehrgpt-0.1.3/tests/integration_tests/runners}/__init__.py +0 -0
  129. {cehrgpt-0.1.1/tests/unit_tests/models → cehrgpt-0.1.3/tests/unit_tests}/__init__.py +0 -0
  130. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/tests/unit_tests/gpt_utils_test.py +0 -0
  131. {cehrgpt-0.1.1/tests/unit_tests/models/tokenization → cehrgpt-0.1.3/tests/unit_tests/models}/__init__.py +0 -0
  132. {cehrgpt-0.1.1/tests/unit_tests/runners → cehrgpt-0.1.3/tests/unit_tests/models/tokenization}/__init__.py +0 -0
  133. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/tests/unit_tests/models/tokenization/create_bins_with_spline_test.py +0 -0
  134. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/tests/unit_tests/models/tokenization/create_sample_from_bins_test.py +0 -0
  135. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/tests/unit_tests/numeric_concept_statistics_test.py +0 -0
  136. {cehrgpt-0.1.1/tests/unit_tests/tools → cehrgpt-0.1.3/tests/unit_tests/runners}/__init__.py +0 -0
  137. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/tests/unit_tests/runners/hf_cehrgpt_finetune_runner_test.py +0 -0
  138. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/tests/unit_tests/tokenization_test.py +0 -0
  139. {cehrgpt-0.1.1 → cehrgpt-0.1.3}/tests/unit_tests/tools/upload_omop_tables_test.py +0 -0
cehrgpt-0.1.3/PKG-INFO ADDED
@@ -0,0 +1,238 @@
1
+ Metadata-Version: 2.4
2
+ Name: cehrgpt
3
+ Version: 0.1.3
4
+ Summary: CEHR-GPT: Generating Electronic Health Records with Chronological Patient Timelines
5
+ Author-email: Chao Pang <chaopang229@gmail.com>, Xinzhuo Jiang <xj2193@cumc.columbia.edu>, Krishna Kalluri <kk3326@cumc.columbia.edu>, Elise Minto <em3697@cumc.columbia.edu>, Jason Patterson <jp3477@cumc.columbia.edu>, Nishanth Parameshwar Pavinkurve <np2689@cumc.columbia.edu>, Karthik Natarajan <kn2174@cumc.columbia.edu>
6
+ License: MIT License
7
+ Classifier: Development Status :: 5 - Production/Stable
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: Intended Audience :: Science/Research
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Requires-Python: >=3.10.0
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: cehrbert>=1.4.8
16
+ Requires-Dist: cehrbert_data==0.0.11
17
+ Requires-Dist: openai==1.54.3
18
+ Requires-Dist: optuna==4.0.0
19
+ Requires-Dist: transformers==4.44.1
20
+ Requires-Dist: tokenizers==0.19.0
21
+ Requires-Dist: peft==0.10.0
22
+ Requires-Dist: lightgbm
23
+ Requires-Dist: polars
24
+ Provides-Extra: dev
25
+ Requires-Dist: pre-commit; extra == "dev"
26
+ Requires-Dist: pytest; extra == "dev"
27
+ Requires-Dist: pytest-cov; extra == "dev"
28
+ Requires-Dist: pytest-subtests; extra == "dev"
29
+ Requires-Dist: rootutils; extra == "dev"
30
+ Requires-Dist: hypothesis; extra == "dev"
31
+ Requires-Dist: black; extra == "dev"
32
+ Provides-Extra: flash-attn
33
+ Requires-Dist: flash_attn; extra == "flash-attn"
34
+ Dynamic: license-file
35
+
36
+ # CEHRGPT
37
+
38
+ [![PyPI - Version](https://img.shields.io/pypi/v/cehrgpt)](https://pypi.org/project/cehrgpt/)
39
+ ![Python](https://img.shields.io/badge/-Python_3.11-blue?logo=python&logoColor=white)
40
+ [![tests](https://github.com/knatarajan-lab/cehrgpt/actions/workflows/tests.yaml/badge.svg)](https://github.com/knatarajan-lab/cehrgpt/actions/workflows/tests.yaml)
41
+ [![license](https://img.shields.io/badge/License-MIT-green.svg?labelColor=gray)](https://github.com/knatarajan-lab/cehrgpt/blob/main/LICENSE)
42
+ [![contributors](https://img.shields.io/github/contributors/knatarajan-lab/cehrgpt.svg)](https://github.com/knatarajan-lab/cehrgpt/graphs/contributors)
43
+
44
+ CEHRGPT is a multi-task foundation model for structured electronic health records (EHR) data that supports three capabilities: feature representation, zero-shot prediction, and synthetic data generation.
45
+
46
+ ## 🎯 Key Capabilities
47
+
48
+ ### Feature Representation
49
+ Extract meaningful patient embeddings from sequences of medical events using **linear probing** techniques for downstream tasks such as disease prediction, patient clustering, and risk stratification.
50
+
51
+ ### Zero-Shot Prediction
52
+ Generate outcome predictions directly from prompts without requiring task-specific training, enabling rapid evaluation in low-label clinical settings.
53
+
54
+ ### Synthetic Data Generation
55
+ Generate comprehensive patient profiles including demographics, medical history, treatment courses, and outcomes while implementing advanced privacy-preserving techniques to ensure generated data contains no identifiable information.
56
+ The platform is fully compatible with the OMOP Common Data Model for seamless integration with existing healthcare systems.
57
+ ## 🚀 Installation
58
+
59
+ Clone the repository and install dependencies:
60
+
61
+ ```bash
62
+ git clone https://github.com/knatarajan-lab/cehrgpt.git
63
+ cd cehrgpt
64
+ pip install .
65
+ ```
66
+
67
+ ## 📋 Prerequisites
68
+
69
+ Before getting started, set up the required environment variables:
70
+
71
+ ```bash
72
+ export CEHRGPT_HOME=$(git rev-parse --show-toplevel)
73
+ export OMOP_DIR="" # Path to your OMOP data
74
+ export CEHR_GPT_DATA_DIR="" # Path for processed data storage
75
+ export CEHR_GPT_MODEL_DIR="" # Path for model storage
76
+ ```
77
+
78
+ Create the dataset cache directory:
79
+ ```bash
80
+ mkdir -p "$CEHR_GPT_DATA_DIR/dataset_prepared"
81
+ ```
82
+
83
+ ## 🏗️ Model Training
84
+
85
+ ### Step 1: Generate Pre-training Data from OMOP
86
+
87
+ Generate the training data by following the [Data Generation Instructions](./data_generation.md).
88
+
89
+ ### Step 2: Pre-train CEHR-GPT
90
+
91
+ Train the foundation model:
92
+
93
+ ```bash
94
+ python -u -m cehrgpt.runners.hf_cehrgpt_pretrain_runner \
95
+ --model_name_or_path $CEHR_GPT_MODEL_DIR \
96
+ --tokenizer_name_or_path $CEHR_GPT_MODEL_DIR \
97
+ --output_dir $CEHR_GPT_MODEL_DIR \
98
+ --data_folder "$CEHR_GPT_DATA_DIR/patient_sequence/train" \
99
+ --dataset_prepared_path "$CEHR_GPT_DATA_DIR/dataset_prepared" \
100
+ --do_train true --seed 42 \
101
+ --dataloader_num_workers 16 --dataloader_prefetch_factor 8 \
102
+ --hidden_size 768 --num_hidden_layers 14 --max_position_embeddings 4096 \
103
+ --evaluation_strategy epoch --save_strategy epoch \
104
+ --sample_packing --max_tokens_per_batch 16384 \
105
+ --warmup_ratio 0.01 --weight_decay 0.01 \
106
+ --num_train_epochs 50 --learning_rate 0.0002 \
107
+ --use_early_stopping --early_stopping_threshold 0.001
108
+ ```
109
+
110
+ > **Tip**: Increase `max_position_embeddings` for longer context windows based on your use case.
111
+
112
+ ## 🎯 Feature Representation
113
+
114
+ CEHR-GPT enables extraction of meaningful patient embeddings from medical event sequences using **linear probing** techniques for downstream prediction tasks. The feature representation pipeline includes label generation, patient sequence extraction, and linear regression model training on the extracted representations.
115
+
116
+ For detailed instructions including cohort creation, patient feature extraction, and linear probing evaluation, please follow the [Feature Representation Guide](./feature_representation.md).
117
+
118
+ ## 🔮 Zero-Shot Prediction
119
+
120
+ CEHR-GPT can generate outcome predictions directly from clinical prompts without requiring task-specific training, making it ideal for rapid evaluation in low-label clinical settings. The zero-shot prediction capability performs time-to-event analysis by processing patient sequences and generating risk predictions based on learned medical patterns.
121
+
122
+ For complete setup instructions including label generation, sequence preparation, and prediction execution, please follow the [Zero-Shot Prediction Guide](./zero_shot_prediction.md).
123
+
124
+ ## 🧬 Synthetic Data Generation
125
+
126
+ CEHR-GPT generates comprehensive synthetic patient profiles including demographics, medical history, treatment courses, and outcomes while implementing advanced privacy-preserving techniques. The synthetic data maintains statistical fidelity to real patient populations without containing identifiable information, and outputs are fully compatible with the OMOP Common Data Model.
127
+
128
+ For step-by-step instructions on generating synthetic sequences and converting them to OMOP format, please follow the [Synthetic Data Generation Guide](./synthetic_data_generation.md).
129
+
130
+ ## 📊 MEDS Support
131
+
132
+ CEHR-GPT supports the Medical Event Data Standard (MEDS) format for enhanced interoperability.
133
+
134
+ ### Prerequisites
135
+
136
+ Configure MEDS-specific environment variables:
137
+
138
+ ```bash
139
+ export CEHR_GPT_MODEL_DIR="" # CEHR-GPT model directory
140
+ export MEDS_DIR="" # MEDS data directory
141
+ export MEDS_READER_DIR="" # MEDS reader output directory
142
+ ```
143
+
144
+ ### Step 1: Create MIMIC MEDS Data
145
+
146
+ Transform MIMIC files to MEDS format following the [MEDS_transforms](https://github.com/mmcdermott/MEDS_transforms/) repository instructions.
147
+
148
+ ### Step 2: Prepare MEDS Reader
149
+
150
+ Convert MEDS data for CEHR-GPT compatibility:
151
+
152
+ ```bash
153
+ meds_reader_convert $MEDS_DIR $MEDS_READER_DIR --num_threads 10
154
+ ```
155
+
156
+ ### Step 3: Pre-train with MEDS Data
157
+
158
+ Execute pre-training using MEDS format:
159
+
160
+ ```bash
161
+ python -u -m cehrgpt.runners.hf_cehrgpt_pretrain_runner \
162
+ --model_name_or_path $CEHR_GPT_MODEL_DIR \
163
+ --tokenizer_name_or_path $CEHR_GPT_MODEL_DIR \
164
+ --output_dir $CEHR_GPT_MODEL_DIR \
165
+ --data_folder $MEDS_READER_DIR \
166
+ --dataset_prepared_path "$CEHR_GPT_MODEL_DIR/dataset_prepared" \
167
+ --do_train true --seed 42 \
168
+ --dataloader_num_workers 16 --dataloader_prefetch_factor 8 \
169
+ --hidden_size 768 --num_hidden_layers 14 --max_position_embeddings 8192 \
170
+ --evaluation_strategy epoch --save_strategy epoch \
171
+ --sample_packing --max_tokens_per_batch 16384 \
172
+ --warmup_steps 500 --weight_decay 0.01 \
173
+ --num_train_epochs 50 --learning_rate 0.0002 \
174
+ --use_early_stopping --early_stopping_threshold 0.001 \
175
+ --is_data_in_meds --inpatient_att_function_type day \
176
+ --att_function_type day --include_inpatient_hour_token \
177
+ --include_auxiliary_token --include_demographic_prompt \
178
+ --meds_to_cehrbert_conversion_type "MedsToBertMimic4"
179
+ ```
180
+
181
+ ### Step 4: Generate MEDS Trajectories
182
+
183
+ #### Environment Setup
184
+
185
+ Configure trajectory generation environment:
186
+
187
+ ```bash
188
+ export MEDS_LABEL_COHORT_DIR="" # Cohort labels directory (parquet files)
189
+ export MEDS_TRAJECTORY_DIR="" # Trajectory output directory
190
+ ```
191
+
192
+ #### Generate Synthetic Trajectories
193
+
194
+ Create patient trajectories with the trained model:
195
+
196
+ ```bash
197
+ python -u -m cehrgpt.generation.cehrgpt_conditional_generation \
198
+ --cohort_folder $MEDS_LABEL_COHORT_DIR \
199
+ --data_folder $MEDS_READER_DIR \
200
+ --dataset_prepared_path "$CEHR_GPT_MODEL_DIR/dataset_prepared" \
201
+ --model_name_or_path $CEHR_GPT_MODEL_DIR \
202
+ --tokenizer_name_or_path $CEHR_GPT_MODEL_DIR \
203
+ --output_dir $MEDS_TRAJECTORY_DIR \
204
+ --per_device_eval_batch_size 16 \
205
+ --num_of_trajectories_per_sample 2 \
206
+ --generation_input_length 4096 \
207
+ --generation_max_new_tokens 4096 \
208
+ --is_data_in_meds \
209
+ --att_function_type day --inpatient_att_function_type day \
210
+ --meds_to_cehrbert_conversion_type MedsToBertMimic4 \
211
+ --include_auxiliary_token --include_demographic_prompt \
212
+ --include_inpatient_hour_token
213
+ ```
214
+
215
+ > **Important**: Ensure `generation_input_length` + `generation_max_new_tokens` ≤ `max_position_embeddings` (8192).
216
+
217
+ #### Parameter Reference
218
+
219
+ - `generation_input_length`: Input context length for generation
220
+ - `generation_max_new_tokens`: Maximum new tokens to generate
221
+ - `num_of_trajectories_per_sample`: Number of trajectories per patient sample
222
+
223
+ ## 📖 Citation
224
+
225
+ If you use CEHRGPT in your research, please cite:
226
+
227
+ ```bibtex
228
+ @article{cehrgpt2024,
229
+ title={CEHR-GPT: Generating Electronic Health Records with Chronological Patient Timelines},
230
+ author={Natarajan, K and others},
231
+ journal={arXiv preprint arXiv:2402.04400},
232
+ year={2024}
233
+ }
234
+ ```
235
+
236
+ ## 📄 License
237
+
238
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,203 @@
1
+ # CEHRGPT
2
+
3
+ [![PyPI - Version](https://img.shields.io/pypi/v/cehrgpt)](https://pypi.org/project/cehrgpt/)
4
+ ![Python](https://img.shields.io/badge/-Python_3.11-blue?logo=python&logoColor=white)
5
+ [![tests](https://github.com/knatarajan-lab/cehrgpt/actions/workflows/tests.yaml/badge.svg)](https://github.com/knatarajan-lab/cehrgpt/actions/workflows/tests.yaml)
6
+ [![license](https://img.shields.io/badge/License-MIT-green.svg?labelColor=gray)](https://github.com/knatarajan-lab/cehrgpt/blob/main/LICENSE)
7
+ [![contributors](https://img.shields.io/github/contributors/knatarajan-lab/cehrgpt.svg)](https://github.com/knatarajan-lab/cehrgpt/graphs/contributors)
8
+
9
+ CEHRGPT is a multi-task foundation model for structured electronic health records (EHR) data that supports three capabilities: feature representation, zero-shot prediction, and synthetic data generation.
10
+
11
+ ## 🎯 Key Capabilities
12
+
13
+ ### Feature Representation
14
+ Extract meaningful patient embeddings from sequences of medical events using **linear probing** techniques for downstream tasks such as disease prediction, patient clustering, and risk stratification.
15
+
16
+ ### Zero-Shot Prediction
17
+ Generate outcome predictions directly from prompts without requiring task-specific training, enabling rapid evaluation in low-label clinical settings.
18
+
19
+ ### Synthetic Data Generation
20
+ Generate comprehensive patient profiles including demographics, medical history, treatment courses, and outcomes while implementing advanced privacy-preserving techniques to ensure generated data contains no identifiable information.
21
+ The platform is fully compatible with the OMOP Common Data Model for seamless integration with existing healthcare systems.
22
+ ## 🚀 Installation
23
+
24
+ Clone the repository and install dependencies:
25
+
26
+ ```bash
27
+ git clone https://github.com/knatarajan-lab/cehrgpt.git
28
+ cd cehrgpt
29
+ pip install .
30
+ ```
31
+
32
+ ## 📋 Prerequisites
33
+
34
+ Before getting started, set up the required environment variables:
35
+
36
+ ```bash
37
+ export CEHRGPT_HOME=$(git rev-parse --show-toplevel)
38
+ export OMOP_DIR="" # Path to your OMOP data
39
+ export CEHR_GPT_DATA_DIR="" # Path for processed data storage
40
+ export CEHR_GPT_MODEL_DIR="" # Path for model storage
41
+ ```
42
+
43
+ Create the dataset cache directory:
44
+ ```bash
45
+ mkdir -p "$CEHR_GPT_DATA_DIR/dataset_prepared"
46
+ ```
47
+
48
+ ## 🏗️ Model Training
49
+
50
+ ### Step 1: Generate Pre-training Data from OMOP
51
+
52
+ Generate the training data by following the [Data Generation Instructions](./data_generation.md).
53
+
54
+ ### Step 2: Pre-train CEHR-GPT
55
+
56
+ Train the foundation model:
57
+
58
+ ```bash
59
+ python -u -m cehrgpt.runners.hf_cehrgpt_pretrain_runner \
60
+ --model_name_or_path $CEHR_GPT_MODEL_DIR \
61
+ --tokenizer_name_or_path $CEHR_GPT_MODEL_DIR \
62
+ --output_dir $CEHR_GPT_MODEL_DIR \
63
+ --data_folder "$CEHR_GPT_DATA_DIR/patient_sequence/train" \
64
+ --dataset_prepared_path "$CEHR_GPT_DATA_DIR/dataset_prepared" \
65
+ --do_train true --seed 42 \
66
+ --dataloader_num_workers 16 --dataloader_prefetch_factor 8 \
67
+ --hidden_size 768 --num_hidden_layers 14 --max_position_embeddings 4096 \
68
+ --evaluation_strategy epoch --save_strategy epoch \
69
+ --sample_packing --max_tokens_per_batch 16384 \
70
+ --warmup_ratio 0.01 --weight_decay 0.01 \
71
+ --num_train_epochs 50 --learning_rate 0.0002 \
72
+ --use_early_stopping --early_stopping_threshold 0.001
73
+ ```
74
+
75
+ > **Tip**: Increase `max_position_embeddings` for longer context windows based on your use case.
76
+
77
+ ## 🎯 Feature Representation
78
+
79
+ CEHR-GPT enables extraction of meaningful patient embeddings from medical event sequences using **linear probing** techniques for downstream prediction tasks. The feature representation pipeline includes label generation, patient sequence extraction, and linear regression model training on the extracted representations.
80
+
81
+ For detailed instructions including cohort creation, patient feature extraction, and linear probing evaluation, please follow the [Feature Representation Guide](./feature_representation.md).
82
+
83
+ ## 🔮 Zero-Shot Prediction
84
+
85
+ CEHR-GPT can generate outcome predictions directly from clinical prompts without requiring task-specific training, making it ideal for rapid evaluation in low-label clinical settings. The zero-shot prediction capability performs time-to-event analysis by processing patient sequences and generating risk predictions based on learned medical patterns.
86
+
87
+ For complete setup instructions including label generation, sequence preparation, and prediction execution, please follow the [Zero-Shot Prediction Guide](./zero_shot_prediction.md).
88
+
89
+ ## 🧬 Synthetic Data Generation
90
+
91
+ CEHR-GPT generates comprehensive synthetic patient profiles including demographics, medical history, treatment courses, and outcomes while implementing advanced privacy-preserving techniques. The synthetic data maintains statistical fidelity to real patient populations without containing identifiable information, and outputs are fully compatible with the OMOP Common Data Model.
92
+
93
+ For step-by-step instructions on generating synthetic sequences and converting them to OMOP format, please follow the [Synthetic Data Generation Guide](./synthetic_data_generation.md).
94
+
95
+ ## 📊 MEDS Support
96
+
97
+ CEHR-GPT supports the Medical Event Data Standard (MEDS) format for enhanced interoperability.
98
+
99
+ ### Prerequisites
100
+
101
+ Configure MEDS-specific environment variables:
102
+
103
+ ```bash
104
+ export CEHR_GPT_MODEL_DIR="" # CEHR-GPT model directory
105
+ export MEDS_DIR="" # MEDS data directory
106
+ export MEDS_READER_DIR="" # MEDS reader output directory
107
+ ```
108
+
109
+ ### Step 1: Create MIMIC MEDS Data
110
+
111
+ Transform MIMIC files to MEDS format following the [MEDS_transforms](https://github.com/mmcdermott/MEDS_transforms/) repository instructions.
112
+
113
+ ### Step 2: Prepare MEDS Reader
114
+
115
+ Convert MEDS data for CEHR-GPT compatibility:
116
+
117
+ ```bash
118
+ meds_reader_convert $MEDS_DIR $MEDS_READER_DIR --num_threads 10
119
+ ```
120
+
121
+ ### Step 3: Pre-train with MEDS Data
122
+
123
+ Execute pre-training using MEDS format:
124
+
125
+ ```bash
126
+ python -u -m cehrgpt.runners.hf_cehrgpt_pretrain_runner \
127
+ --model_name_or_path $CEHR_GPT_MODEL_DIR \
128
+ --tokenizer_name_or_path $CEHR_GPT_MODEL_DIR \
129
+ --output_dir $CEHR_GPT_MODEL_DIR \
130
+ --data_folder $MEDS_READER_DIR \
131
+ --dataset_prepared_path "$CEHR_GPT_MODEL_DIR/dataset_prepared" \
132
+ --do_train true --seed 42 \
133
+ --dataloader_num_workers 16 --dataloader_prefetch_factor 8 \
134
+ --hidden_size 768 --num_hidden_layers 14 --max_position_embeddings 8192 \
135
+ --evaluation_strategy epoch --save_strategy epoch \
136
+ --sample_packing --max_tokens_per_batch 16384 \
137
+ --warmup_steps 500 --weight_decay 0.01 \
138
+ --num_train_epochs 50 --learning_rate 0.0002 \
139
+ --use_early_stopping --early_stopping_threshold 0.001 \
140
+ --is_data_in_meds --inpatient_att_function_type day \
141
+ --att_function_type day --include_inpatient_hour_token \
142
+ --include_auxiliary_token --include_demographic_prompt \
143
+ --meds_to_cehrbert_conversion_type "MedsToBertMimic4"
144
+ ```
145
+
146
+ ### Step 4: Generate MEDS Trajectories
147
+
148
+ #### Environment Setup
149
+
150
+ Configure trajectory generation environment:
151
+
152
+ ```bash
153
+ export MEDS_LABEL_COHORT_DIR="" # Cohort labels directory (parquet files)
154
+ export MEDS_TRAJECTORY_DIR="" # Trajectory output directory
155
+ ```
156
+
157
+ #### Generate Synthetic Trajectories
158
+
159
+ Create patient trajectories with the trained model:
160
+
161
+ ```bash
162
+ python -u -m cehrgpt.generation.cehrgpt_conditional_generation \
163
+ --cohort_folder $MEDS_LABEL_COHORT_DIR \
164
+ --data_folder $MEDS_READER_DIR \
165
+ --dataset_prepared_path "$CEHR_GPT_MODEL_DIR/dataset_prepared" \
166
+ --model_name_or_path $CEHR_GPT_MODEL_DIR \
167
+ --tokenizer_name_or_path $CEHR_GPT_MODEL_DIR \
168
+ --output_dir $MEDS_TRAJECTORY_DIR \
169
+ --per_device_eval_batch_size 16 \
170
+ --num_of_trajectories_per_sample 2 \
171
+ --generation_input_length 4096 \
172
+ --generation_max_new_tokens 4096 \
173
+ --is_data_in_meds \
174
+ --att_function_type day --inpatient_att_function_type day \
175
+ --meds_to_cehrbert_conversion_type MedsToBertMimic4 \
176
+ --include_auxiliary_token --include_demographic_prompt \
177
+ --include_inpatient_hour_token
178
+ ```
179
+
180
+ > **Important**: Ensure `generation_input_length` + `generation_max_new_tokens` ≤ `max_position_embeddings` (8192).
181
+
182
+ #### Parameter Reference
183
+
184
+ - `generation_input_length`: Input context length for generation
185
+ - `generation_max_new_tokens`: Maximum new tokens to generate
186
+ - `num_of_trajectories_per_sample`: Number of trajectories per patient sample
187
+
188
+ ## 📖 Citation
189
+
190
+ If you use CEHRGPT in your research, please cite:
191
+
192
+ ```bibtex
193
+ @article{cehrgpt2024,
194
+ title={CEHR-GPT: Generating Electronic Health Records with Chronological Patient Timelines},
195
+ author={Natarajan, K and others},
196
+ journal={arXiv preprint arXiv:2402.04400},
197
+ year={2024}
198
+ }
199
+ ```
200
+
201
+ ## 📄 License
202
+
203
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1 @@
1
+ xformers==0 # blocks installation
@@ -0,0 +1,119 @@
1
+ # CEHR-GPT Data Generation
2
+
3
+ This guide covers the process of generating pre-training data for CEHR-GPT from OMOP-formatted healthcare datasets.
4
+
5
+ ## Prerequisites
6
+
7
+ Before starting data generation, ensure you have:
8
+
9
+ 1. **Spark Environment**: Configured Apache Spark (see [Spark Setup README](./spark_setup.md))
10
+ 2. **OMOP Data**: Healthcare data in OMOP Common Data Model format
11
+ 3. **Environment Variables**: Required paths and directories set up
12
+
13
+ ## Required Environment Variables
14
+
15
+ Set up the necessary directory paths:
16
+
17
+ ```bash
18
+ # CEHR-GPT installation directory
19
+ export CEHRGPT_HOME=$(git rev-parse --show-toplevel)  # auto-detected; run from inside the git checkout
20
+
21
+ # OMOP input data directory
22
+ export OMOP_DIR="/path/to/omop/data"
23
+
24
+ # Output directory for processed data
25
+ export CEHR_GPT_DATA_DIR="/path/to/output/data"
26
+ ```
27
+
28
+ ## Step 1: Configure Spark for Data Processing
29
+
30
+ Set up Spark environment variables optimized for healthcare data processing:
31
+
32
+ ```bash
33
+ # Worker configuration
34
+ export SPARK_WORKER_INSTANCES="1"
35
+ export SPARK_MASTER="local[16]"
36
+ export SPARK_WORKER_CORES="16"
37
+ export SPARK_EXECUTOR_CORES="4"
38
+
39
+ # Memory configuration
40
+ export SPARK_DRIVER_MEMORY="20g"
41
+ export SPARK_EXECUTOR_MEMORY="20g"
42
+
43
+ export SPARK_SUBMIT_OPTIONS="--master $SPARK_MASTER --driver-memory $SPARK_DRIVER_MEMORY --executor-memory $SPARK_EXECUTOR_MEMORY --executor-cores $SPARK_EXECUTOR_CORES"
44
+ ```
45
+
46
+ ### Configuration Guidelines
47
+
48
+ **Memory Allocation:**
49
+ - **Small datasets (< 1M patients)**: 8GB driver/executor memory
50
+ - **Medium datasets (1-10M patients)**: 12-16GB driver/executor memory
51
+ - **Large datasets (> 10M patients)**: 20-32GB driver/executor memory
52
+
53
+ **Core Allocation:**
54
+ - Adjust `SPARK_WORKER_CORES` based on available CPU cores
55
+ - Keep `SPARK_EXECUTOR_CORES` at 2-4 for optimal performance
56
+ - Reserve 2-4 cores for system processes
57
+
58
+ ## Step 2: Generate Pre-training Data
59
+
60
+ Execute the data generation script:
61
+
62
+ ```bash
63
+ sh $CEHRGPT_HOME/scripts/create_cehrgpt_pretraining_data.sh \
64
+ --input_folder $OMOP_DIR \
65
+ --output_folder $CEHR_GPT_DATA_DIR \
66
+ --start_date "1985-01-01"
67
+ ```
68
+
69
+ ### Script Parameters
70
+
71
+ - `--input_folder`: Directory containing OMOP-formatted data files
72
+ - `--output_folder`: Directory where processed data will be saved
73
+ - `--start_date`: Earliest date for including patient records (format: YYYY-MM-DD)
74
+
75
+ ## Performance Optimization
76
+
77
+ ### For Large Datasets
78
+
79
+ ```bash
80
+ # Increase parallelism
81
+ export SPARK_SQL_SHUFFLE_PARTITIONS="800"
82
+
83
+ # Enable dynamic allocation
84
+ export SPARK_CONF_spark_dynamicAllocation_enabled="true"
85
+ export SPARK_CONF_spark_dynamicAllocation_minExecutors="2"
86
+ export SPARK_CONF_spark_dynamicAllocation_maxExecutors="20"
87
+ ```
88
+
89
+ ### Memory Optimization
90
+
91
+ ```bash
92
+ # Tune garbage collection
93
+ export SPARK_CONF_spark_executor_extraJavaOptions="-XX:+UseG1GC -XX:+PrintGCDetails"
94
+
95
+ # Optimize serialization
96
+ export SPARK_CONF_spark_serializer_objectStreamReset="100"
97
+ ```
98
+
99
+ ## Troubleshooting
100
+
101
+ ### Common Issues
102
+
103
+ **Out of Memory Errors:**
104
+ ```bash
105
+ # Increase driver memory (above the 20g configured in Step 1)
106
+ export SPARK_DRIVER_MEMORY="32g"
107
+
108
+ # Increase executor memory (above the 20g configured in Step 1)
109
+ export SPARK_EXECUTOR_MEMORY="32g"
110
+ ```
111
+
112
+ **Slow Performance:**
113
+ ```bash
114
+ # Increase parallelism
115
+ export SPARK_WORKER_CORES="32"
116
+
117
+ # Enable adaptive query execution
118
+ export SPARK_CONF_spark_sql_adaptive_enabled="true"
119
+ ```
@@ -0,0 +1,109 @@
1
+ # CEHR-GPT Feature Representation using Linear Probing
2
+
3
+ This guide covers the process of extracting meaningful patient embeddings from healthcare sequences using **linear probing** techniques for downstream prediction tasks such as disease prediction, patient clustering, and risk stratification.
4
+
5
+ ## Prerequisites
6
+
7
+ Ensure you have:
8
+
9
+ 1. **Trained CEHR-GPT Model**: Pre-trained model available at `$CEHR_GPT_MODEL_DIR`
10
+ 2. **OMOP Data**: Healthcare data processed and ready for feature extraction
11
+ 3. **Environment Setup**: Required environment variables configured
12
+
13
+ ## Required Environment Variables
14
+
15
+ Set up the necessary directory paths:
16
+
17
+ ```bash
18
+ # CEHR-GPT installation directory (auto-detect from git repository)
19
+ export CEHRGPT_HOME=$(git rev-parse --show-toplevel)
20
+ export CEHR_GPT_MODEL_DIR="/path/to/trained/model"
21
+
22
+ # Data directories
23
+ export OMOP_DIR="/path/to/omop/data"
24
+ export CEHR_GPT_DATA_DIR="/path/to/processed/data"
25
+ export CEHRGPT_FEATURES_DIR="/path/to/extracted/features"
26
+ ```
27
+
28
+ ## Step 1: Generate Prediction Labels
29
+
30
+ Create heart failure readmission labels compatible with the MEDS schema for downstream prediction tasks:
31
+
32
+ ```bash
33
+ python -u -m cehrbert_data.prediction_cohorts.hf_readmission \
34
+ -c hf_readmission -i $OMOP_DIR -o $OMOP_DIR/labels \
35
+ -dl 1985-01-01 -du 2023-12-31 \
36
+ -l 18 -u 100 -ow 730 -ps 1 -pw 30 \
37
+ --is_new_patient_representation \
38
+ --should_construct_artificial_visits \
39
+ --include_concept_list \
40
+ --is_remove_index_prediction_starts \
41
+ --meds_format \
42
+ --exclude_features
43
+ ```
44
+
45
+ ### Parameter Explanation
46
+
47
+ - `-c hf_readmission`: Cohort name for heart failure readmission prediction
48
+ - `-i $OMOP_DIR`: Input directory containing OMOP data
49
+ - `-o $OMOP_DIR/labels`: Output directory for generated labels
50
+ - `-dl/-du`: Date range for patient inclusion (1985-2023)
51
+ - `-l 18 -u 100`: Age limits (18-100 years)
52
+ - `-ow 730`: Observation window in days (2 years)
53
+ - `-ps 1 -pw 30`: Prediction start (1 day) and window (30 days)
54
+ - `--is_remove_index_prediction_starts`: Remove cases where outcome events occur before prediction start date
55
+ - `--include_concept_list`: Include only concepts that are allowed in the model vocabulary
56
+ - `--meds_format`: Output in MEDS-compatible format
57
+
58
+
59
+
60
+ ## Step 2: Extract Patient Features
61
+
62
+ Extract patient sequences using a 2-year observation window, focusing on key clinical events:
63
+
64
+ ```bash
65
+ sh $CEHRGPT_HOME/scripts/extract_features_gpt.sh \
66
+ --cohort-folder $OMOP_DIR/labels \
67
+ --input-dir $OMOP_DIR \
68
+ --output-dir "$CEHR_GPT_DATA_DIR/phenotype_cehrgpt_sequences" \
69
+ --patient-splits-folder "$OMOP_DIR/patient_splits" \
70
+ --ehr-tables "condition_occurrence procedure_occurrence drug_exposure" \
71
+ --observation-window 730
72
+ ```
73
+ > **Tip**: This step requires PySpark. See the [Spark Setup README](./spark_setup.md) for instructions on configuring Apache Spark.
74
+
75
+ ### Key Parameters
76
+
77
+ - `--cohort-folder`: Directory containing prediction labels
78
+ - `--input-dir`: Source OMOP data directory
79
+ - `--output-dir`: Output directory for extracted sequences
80
+ - `--patient-splits-folder`: Pre-defined train/validation/test splits
81
+ - `--ehr-tables`: Clinical tables to include in feature extraction
82
+ - `--observation-window`: Observation period in days (730 = 2 years)
83
+
84
+
85
+
86
+ ## Step 3: Run Feature Extraction and Linear Probing
87
+
88
+ Execute CEHR-GPT feature extraction and train a linear regression model on the extracted patient representations:
89
+
90
+ ```bash
91
+ sh $CEHRGPT_HOME/run_cehrgpt.sh \
92
+ --base_dir="$CEHR_GPT_DATA_DIR/phenotype_cehrgpt_sequences" \
93
+ --dataset_prepared_path="$CEHR_GPT_DATA_DIR/dataset_prepared" \
94
+ --model_path=$CEHR_GPT_MODEL_DIR \
95
+ --output_dir=$CEHRGPT_FEATURES_DIR \
96
+ --preprocessing_workers=8 \
97
+ --model_name="cehrgpt"
98
+ ```
99
+
100
+ This step both extracts features from the patient sequences and trains a linear regression model on the resulting patient representations for downstream prediction tasks.
101
+
102
+ ### Parameter Details
103
+
104
+ - `--base_dir`: Directory containing prepared patient sequences
105
+ - `--dataset_prepared_path`: Path for preprocessed datasets
106
+ - `--model_path`: Location of trained CEHR-GPT model
107
+ - `--output_dir`: Output directory for extracted features and embeddings
108
+ - `--preprocessing_workers`: Number of parallel workers for data preprocessing
109
+ - `--model_name`: Model identifier for feature extraction
@@ -28,7 +28,7 @@ classifiers = [
28
28
  ]
29
29
 
30
30
  dependencies = [
31
- "cehrbert==1.4.5",
31
+ "cehrbert>=1.4.8",
32
32
  "cehrbert_data==0.0.11",
33
33
  "openai==1.54.3",
34
34
  "optuna==4.0.0",
@@ -0,0 +1,5 @@
1
+ [DEFAULT]
2
+ base_url = jdbc:jtds:sqlserver://omop.dbmi.columbia.edu:1433;databaseName=your_database
3
+ driver = net.sourceforge.jtds.jdbc.Driver
4
+ user = username
5
+ password = password