cehrgpt 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. cehrgpt-0.0.1/.github/workflows/build-python.yaml +93 -0
  2. cehrgpt-0.0.1/.github/workflows/tests.yaml +39 -0
  3. cehrgpt-0.0.1/.gitignore +38 -0
  4. cehrgpt-0.0.1/.pre-commit-config.yaml +83 -0
  5. cehrgpt-0.0.1/LICENSE +21 -0
  6. cehrgpt-0.0.1/PKG-INFO +66 -0
  7. cehrgpt-0.0.1/README.md +35 -0
  8. cehrgpt-0.0.1/pyproject.toml +47 -0
  9. cehrgpt-0.0.1/sample_data/pretrain/patient_sequence.parquet +0 -0
  10. cehrgpt-0.0.1/sample_data/pretrained_embeddings/pretrained_embedding_concepts.pkl +0 -0
  11. cehrgpt-0.0.1/sample_data/pretrained_embeddings/pretrained_embedding_vectors.npy +0 -0
  12. cehrgpt-0.0.1/scripts/level_three_evaluation.sh +165 -0
  13. cehrgpt-0.0.1/scripts/omop_pipeline.sh +73 -0
  14. cehrgpt-0.0.1/scripts/pool_generated_sequences.sh +16 -0
  15. cehrgpt-0.0.1/setup.cfg +4 -0
  16. cehrgpt-0.0.1/src/__init__.py +0 -0
  17. cehrgpt-0.0.1/src/cehrgpt/__init__.py +0 -0
  18. cehrgpt-0.0.1/src/cehrgpt/analysis/__init__.py +0 -0
  19. cehrgpt-0.0.1/src/cehrgpt/analysis/privacy/__init__.py +0 -0
  20. cehrgpt-0.0.1/src/cehrgpt/analysis/privacy/attribute_inference.py +275 -0
  21. cehrgpt-0.0.1/src/cehrgpt/analysis/privacy/attribute_inference_config.yml +8975 -0
  22. cehrgpt-0.0.1/src/cehrgpt/analysis/privacy/member_inference.py +172 -0
  23. cehrgpt-0.0.1/src/cehrgpt/analysis/privacy/nearest_neighbor_inference.py +189 -0
  24. cehrgpt-0.0.1/src/cehrgpt/analysis/privacy/reid_inference.py +407 -0
  25. cehrgpt-0.0.1/src/cehrgpt/analysis/privacy/utils.py +255 -0
  26. cehrgpt-0.0.1/src/cehrgpt/cehrgpt_args.py +142 -0
  27. cehrgpt-0.0.1/src/cehrgpt/data/__init__.py +0 -0
  28. cehrgpt-0.0.1/src/cehrgpt/data/hf_cehrgpt_dataset.py +80 -0
  29. cehrgpt-0.0.1/src/cehrgpt/data/hf_cehrgpt_dataset_collator.py +482 -0
  30. cehrgpt-0.0.1/src/cehrgpt/data/hf_cehrgpt_dataset_mapping.py +116 -0
  31. cehrgpt-0.0.1/src/cehrgpt/generation/__init__.py +0 -0
  32. cehrgpt-0.0.1/src/cehrgpt/generation/chatgpt_generation.py +106 -0
  33. cehrgpt-0.0.1/src/cehrgpt/generation/generate_batch_hf_gpt_sequence.py +333 -0
  34. cehrgpt-0.0.1/src/cehrgpt/generation/omop_converter_batch.py +644 -0
  35. cehrgpt-0.0.1/src/cehrgpt/generation/omop_entity.py +515 -0
  36. cehrgpt-0.0.1/src/cehrgpt/gpt_utils.py +331 -0
  37. cehrgpt-0.0.1/src/cehrgpt/models/__init__.py +0 -0
  38. cehrgpt-0.0.1/src/cehrgpt/models/config.py +205 -0
  39. cehrgpt-0.0.1/src/cehrgpt/models/hf_cehrgpt.py +1817 -0
  40. cehrgpt-0.0.1/src/cehrgpt/models/hf_modeling_outputs.py +158 -0
  41. cehrgpt-0.0.1/src/cehrgpt/models/pretrained_embeddings.py +82 -0
  42. cehrgpt-0.0.1/src/cehrgpt/models/special_tokens.py +30 -0
  43. cehrgpt-0.0.1/src/cehrgpt/models/tokenization_hf_cehrgpt.py +1077 -0
  44. cehrgpt-0.0.1/src/cehrgpt/omop/__init__.py +0 -0
  45. cehrgpt-0.0.1/src/cehrgpt/omop/condition_era.py +20 -0
  46. cehrgpt-0.0.1/src/cehrgpt/omop/observation_period.py +43 -0
  47. cehrgpt-0.0.1/src/cehrgpt/omop/omop_argparse.py +38 -0
  48. cehrgpt-0.0.1/src/cehrgpt/omop/omop_table_builder.py +86 -0
  49. cehrgpt-0.0.1/src/cehrgpt/omop/queries/__init__.py +0 -0
  50. cehrgpt-0.0.1/src/cehrgpt/omop/queries/condition_era.py +86 -0
  51. cehrgpt-0.0.1/src/cehrgpt/omop/queries/observation_period.py +135 -0
  52. cehrgpt-0.0.1/src/cehrgpt/omop/sample_omop_tables.py +71 -0
  53. cehrgpt-0.0.1/src/cehrgpt/runners/__init__.py +0 -0
  54. cehrgpt-0.0.1/src/cehrgpt/runners/gpt_runner_util.py +99 -0
  55. cehrgpt-0.0.1/src/cehrgpt/runners/hf_cehrgpt_finetune_runner.py +746 -0
  56. cehrgpt-0.0.1/src/cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +370 -0
  57. cehrgpt-0.0.1/src/cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +137 -0
  58. cehrgpt-0.0.1/src/cehrgpt/runners/hyperparameter_search_util.py +223 -0
  59. cehrgpt-0.0.1/src/cehrgpt/time_to_event/__init__.py +0 -0
  60. cehrgpt-0.0.1/src/cehrgpt/time_to_event/config/30_day_readmission.yaml +8 -0
  61. cehrgpt-0.0.1/src/cehrgpt/time_to_event/config/next_visit_type_prediction.yaml +8 -0
  62. cehrgpt-0.0.1/src/cehrgpt/time_to_event/config/t2dm_hf.yaml +8 -0
  63. cehrgpt-0.0.1/src/cehrgpt/time_to_event/time_to_event_model.py +226 -0
  64. cehrgpt-0.0.1/src/cehrgpt/time_to_event/time_to_event_prediction.py +347 -0
  65. cehrgpt-0.0.1/src/cehrgpt/time_to_event/time_to_event_utils.py +55 -0
  66. cehrgpt-0.0.1/src/cehrgpt/tools/__init__.py +0 -0
  67. cehrgpt-0.0.1/src/cehrgpt/tools/ehrshot_benchmark.py +74 -0
  68. cehrgpt-0.0.1/src/cehrgpt/tools/generate_pretrained_embeddings.py +130 -0
  69. cehrgpt-0.0.1/src/cehrgpt/tools/merge_synthetic_real_dataasets.py +218 -0
  70. cehrgpt-0.0.1/src/cehrgpt/tools/upload_omop_tables.py +108 -0
  71. cehrgpt-0.0.1/src/cehrgpt.egg-info/PKG-INFO +66 -0
  72. cehrgpt-0.0.1/src/cehrgpt.egg-info/SOURCES.txt +90 -0
  73. cehrgpt-0.0.1/src/cehrgpt.egg-info/dependency_links.txt +1 -0
  74. cehrgpt-0.0.1/src/cehrgpt.egg-info/requires.txt +18 -0
  75. cehrgpt-0.0.1/src/cehrgpt.egg-info/top_level.txt +2 -0
  76. cehrgpt-0.0.1/tests/__init__.py +0 -0
  77. cehrgpt-0.0.1/tests/integration_tests/__init__.py +0 -0
  78. cehrgpt-0.0.1/tests/integration_tests/runners/__init__.py +0 -0
  79. cehrgpt-0.0.1/tests/integration_tests/runners/hf_cehrgpt_pretrain_runner_test.py +123 -0
  80. cehrgpt-0.0.1/tests/integration_tests/runners/hf_cehrgpt_pretrain_sfm_runner_test.py +103 -0
  81. cehrgpt-0.0.1/tests/unit_tests/__init__.py +0 -0
  82. cehrgpt-0.0.1/tests/unit_tests/gpt_utils_test.py +168 -0
  83. cehrgpt-0.0.1/tests/unit_tests/models/__init__.py +0 -0
  84. cehrgpt-0.0.1/tests/unit_tests/models/tokenization/__init__.py +0 -0
  85. cehrgpt-0.0.1/tests/unit_tests/models/tokenization/create_bins_with_spline_test.py +48 -0
  86. cehrgpt-0.0.1/tests/unit_tests/models/tokenization/create_sample_from_bins_test.py +43 -0
  87. cehrgpt-0.0.1/tests/unit_tests/numeric_concept_statistics_test.py +175 -0
  88. cehrgpt-0.0.1/tests/unit_tests/runners/__init__.py +0 -0
  89. cehrgpt-0.0.1/tests/unit_tests/runners/hf_cehrgpt_finetune_runner_test.py +110 -0
  90. cehrgpt-0.0.1/tests/unit_tests/tokenization_test.py +104 -0
  91. cehrgpt-0.0.1/tests/unit_tests/tools/__init__.py +0 -0
  92. cehrgpt-0.0.1/tests/unit_tests/tools/upload_omop_tables_test.py +86 -0
@@ -0,0 +1,93 @@
1
+ name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI
2
+
3
+ on: push
4
+
5
+ jobs:
6
+ build:
7
+ name: Build distribution 📦
8
+ runs-on: ubuntu-latest
9
+
10
+ steps:
11
+ - uses: actions/checkout@v4
12
+ - name: Set up Python
13
+ uses: actions/setup-python@v4
14
+ with:
15
+ python-version: "3.x"
16
+ - name: Install pypa/build
17
+ run: >-
18
+ python3 -m
19
+ pip install
20
+ build
21
+ --user
22
+ - name: Build a binary wheel and a source tarball
23
+ run: python3 -m build
24
+ - name: Store the distribution packages
25
+ uses: actions/upload-artifact@v4
26
+ with:
27
+ name: python-package-distributions
28
+ path: dist/
29
+
30
+ publish-to-pypi:
31
+ name: >-
32
+ Publish Python 🐍 distribution 📦 to PyPI
33
+ if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes
34
+ needs:
35
+ - build
36
+ runs-on: ubuntu-latest
37
+ environment:
38
+ name: pypi
39
+ url: https://pypi.org/p/cehrgpt # PyPI project page for this package
40
+ permissions:
41
+ id-token: write # IMPORTANT: mandatory for trusted publishing
42
+
43
+ steps:
44
+ - name: Download all the dists
45
+ uses: actions/download-artifact@v4
46
+ with:
47
+ name: python-package-distributions
48
+ path: dist/
49
+ - name: Publish distribution 📦 to PyPI
50
+ uses: pypa/gh-action-pypi-publish@release/v1
51
+
52
+ github-release:
53
+ name: >-
54
+ Sign the Python 🐍 distribution 📦 with Sigstore
55
+ and upload them to GitHub Release
56
+ needs:
57
+ - publish-to-pypi
58
+ runs-on: ubuntu-latest
59
+
60
+ permissions:
61
+ contents: write # IMPORTANT: mandatory for making GitHub Releases
62
+ id-token: write # IMPORTANT: mandatory for sigstore
63
+
64
+ steps:
65
+ - name: Download all the dists
66
+ uses: actions/download-artifact@v4
67
+ with:
68
+ name: python-package-distributions
69
+ path: dist/
70
+ - name: Sign the dists with Sigstore
71
+ uses: sigstore/gh-action-sigstore-python@v3.0.0
72
+ with:
73
+ inputs: >-
74
+ ./dist/*.tar.gz
75
+ ./dist/*.whl
76
+ - name: Create GitHub Release
77
+ env:
78
+ GITHUB_TOKEN: ${{ github.token }}
79
+ run: >-
80
+ gh release create
81
+ '${{ github.ref_name }}'
82
+ --repo '${{ github.repository }}'
83
+ --notes ""
84
+ - name: Upload artifact signatures to GitHub Release
85
+ env:
86
+ GITHUB_TOKEN: ${{ github.token }}
87
+ # Upload to GitHub Release using the `gh` CLI.
88
+ # `dist/` contains the built packages, and the
89
+ # sigstore-produced signatures and certificates.
90
+ run: >-
91
+ gh release upload
92
+ '${{ github.ref_name }}' dist/**
93
+ --repo '${{ github.repository }}'
@@ -0,0 +1,39 @@
1
+ # This workflow will install Python dependencies, run tests and lint with a single version of Python
2
+ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
3
+
4
+ name: Tests
5
+
6
+ on:
7
+ push:
8
+ branches: [ "main" ]
9
+ pull_request:
10
+ branches: [ "main" ]
11
+
12
+ permissions:
13
+ contents: read
14
+
15
+ jobs:
16
+ build:
17
+
18
+ runs-on: ubuntu-latest
19
+
20
+ steps:
21
+ - uses: actions/checkout@v3
22
+ - name: Set up Python 3.10.0
23
+ uses: actions/setup-python@v3
24
+ with:
25
+ python-version: "3.10"
26
+ - name: Install dependencies
27
+ run: |
28
+ python -m pip install --upgrade pip
29
+ pip install flake8 pytest
30
+ pip install -e .
31
+ - name: Lint with flake8
32
+ run: |
33
+ # stop the build if there are Python syntax errors or undefined names
34
+ flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
35
+ # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
36
+ flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
37
+ - name: Test with pytest
38
+ run: |
39
+ PYTHONPATH=./: pytest
@@ -0,0 +1,38 @@
1
+ .DS_Store
2
+ .idea/
3
+ .vscode/
4
+ venv*
5
+ dist/*
6
+
7
+ *ipynb_checkpoints/
8
+ *h5
9
+ *logs
10
+ *nohup.out
11
+ *ipynb
12
+
13
+ *__pycache__/
14
+ .eggs/
15
+ *.dat
16
+ .metastore_db/
17
+
18
+ src/cehr-bert/evaluations/hospitalization_baseline.py
19
+ src/cehr-bert/evaluations/hospitalization_prediction-new-data-generator.py
20
+ hospitalization_baseline.py
21
+ hospitalization_prediction-new-data-generator.py
22
+ build/
23
+ concept_embeddings.egg-info/
24
+
25
+ src/cehr-bert/runner/*yaml
26
+
27
+ *.out
28
+ *.egg-info/
29
+
30
+ cehr_transformers.egg-info/dependency_links.txt
31
+ cehr_transformers.egg-info/PKG-INFO
32
+ cehr_transformers.egg-info/requires.txt
33
+ cehr_transformers.egg-info/SOURCES.txt
34
+ cehr_transformers.egg-info/top_level.txt
35
+
36
+ test_data
37
+ test_dataset_prepared
38
+ test*results
@@ -0,0 +1,83 @@
1
+ # For documentation on pre-commit usage, see https://pre-commit.com/
2
+ # This file should be updated quarterly by a developer running `pre-commit autoupdate`
3
+ # with changes added and committed.
4
+ # This will run all defined formatters prior to adding a commit.
5
+ default_language_version:
6
+ python: python3 # or python3.10 to set a specific default version
7
+
8
+ repos:
9
+ - repo: https://github.com/pre-commit/pre-commit-hooks
10
+ rev: v4.6.0
11
+ hooks:
12
+ - id: check-yaml
13
+ - id: end-of-file-fixer
14
+ - id: trailing-whitespace
15
+
16
+ - repo: https://github.com/DanielNoord/pydocstringformatter
17
+ rev: 'v0.7.3'
18
+ hooks:
19
+ - id: pydocstringformatter
20
+
21
+ - repo: https://github.com/PyCQA/autoflake
22
+ rev: v2.2.0
23
+ hooks:
24
+ - id: autoflake
25
+
26
+ - repo: https://github.com/psf/black
27
+ rev: '24.1.1'
28
+ hooks:
29
+ - id: black
30
+ # It is recommended to specify the latest version of Python
31
+ # supported by your project here, or alternatively use
32
+ # pre-commit's default_language_version, see
33
+ # https://pre-commit.com/#top_level-default_language_version
34
+ # Pre-commit hook info from: https://black.readthedocs.io/en/stable/integrations/source_version_control.html
35
+ # Editor integration here: https://black.readthedocs.io/en/stable/integrations/editors.html
36
+
37
+ - repo: https://github.com/adamchainz/blacken-docs
38
+ rev: "v1.12.1" # replace with latest tag on GitHub
39
+ hooks:
40
+ - id: blacken-docs
41
+ additional_dependencies:
42
+ - black>=22.12.0
43
+
44
+ - repo: https://github.com/pre-commit/pre-commit-hooks
45
+ rev: 'v4.5.0'
46
+ hooks:
47
+ - id: trailing-whitespace
48
+ exclude: .git/COMMIT_EDITMSG
49
+ - id: end-of-file-fixer
50
+ exclude: .git/COMMIT_EDITMSG
51
+ - id: detect-private-key
52
+ - id: debug-statements
53
+ - id: check-json
54
+ - id: pretty-format-json
55
+ - id: check-yaml
56
+ - id: name-tests-test
57
+ - id: requirements-txt-fixer
58
+
59
+ - repo: https://github.com/pre-commit/pygrep-hooks
60
+ rev: 'v1.10.0'
61
+ hooks:
62
+ # - id: python-no-eval
63
+ - id: python-no-log-warn
64
+ - id: python-use-type-annotations
65
+
66
+ - repo: https://github.com/Lucas-C/pre-commit-hooks
67
+ rev: v1.5.4
68
+ hooks:
69
+ - id: remove-crlf
70
+ - id: remove-tabs # defaults to: 4
71
+ exclude: .git/COMMIT_EDITMSG
72
+
73
+ - repo: https://github.com/PyCQA/isort.git
74
+ rev: 5.13.2
75
+ hooks:
76
+ - id: isort
77
+ args: [ "--profile", "black" ]
78
+
79
+ - repo: https://github.com/PyCQA/bandit
80
+ rev: '1.7.7'
81
+ hooks:
82
+ - id: bandit
83
+ args: ["--skip", "B101,B106,B107,B301,B311,B105,B608,B403"]
cehrgpt-0.0.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Department of Biomedical Informatics
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
cehrgpt-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,66 @@
1
+ Metadata-Version: 2.2
2
+ Name: cehrgpt
3
+ Version: 0.0.1
4
+ Summary: CEHR-GPT: Generating Electronic Health Records with Chronological Patient Timelines
5
+ Author-email: Chao Pang <chaopang229@gmail.com>, Xinzhuo Jiang <xj2193@cumc.columbia.edu>, Krishna Kalluri <kk3326@cumc.columbia.edu>, Elise Minto <em3697@cumc.columbia.edu>, Jason Patterson <jp3477@cumc.columbia.edu>, Nishanth Parameshwar Pavinkurve <np2689@cumc.columbia.edu>, Karthik Natarajan <kn2174@cumc.columbia.edu>
6
+ License: MIT License
7
+ Classifier: Development Status :: 5 - Production/Stable
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: Intended Audience :: Science/Research
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Requires-Python: >=3.10.0
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: cehrbert==1.3.3
16
+ Requires-Dist: openai==1.54.3
17
+ Requires-Dist: optuna==4.0.0
18
+ Requires-Dist: transformers==4.40.0
19
+ Requires-Dist: tokenizers==0.19
20
+ Requires-Dist: trl==0.11.4
21
+ Provides-Extra: dev
22
+ Requires-Dist: pre-commit; extra == "dev"
23
+ Requires-Dist: pytest; extra == "dev"
24
+ Requires-Dist: pytest-cov; extra == "dev"
25
+ Requires-Dist: pytest-subtests; extra == "dev"
26
+ Requires-Dist: rootutils; extra == "dev"
27
+ Requires-Dist: hypothesis; extra == "dev"
28
+ Requires-Dist: black; extra == "dev"
29
+ Provides-Extra: flash-attn
30
+ Requires-Dist: flash_attn; extra == "flash-attn"
31
+
32
+ # CEHRGPT
33
+
34
+ [![PyPI - Version](https://img.shields.io/pypi/v/cehrgpt)](https://pypi.org/project/cehrgpt/)
35
+ ![Python](https://img.shields.io/badge/-Python_3.11-blue?logo=python&logoColor=white)
36
+ [![tests](https://github.com/knatarajan-lab/cehrgpt-public/actions/workflows/tests.yaml/badge.svg)](https://github.com/knatarajan-lab/cehrgpt-public/actions/workflows/tests.yaml)
37
+ [![license](https://img.shields.io/badge/License-MIT-green.svg?labelColor=gray)](https://github.com/knatarajan-lab/cehrgpt-public/blob/main/LICENSE)
38
+ [![contributors](https://img.shields.io/github/contributors/knatarajan-lab/cehrgpt-public.svg)](https://github.com/knatarajan-lab/cehrgpt-public/graphs/contributors)
39
+
40
+ ## Description
41
+ CEHRGPT is a synthetic data generation model developed to handle structured electronic health records (EHR) with enhanced privacy and reliability. It leverages state-of-the-art natural language processing techniques to create realistic, anonymized patient data that can be used for research and development without compromising patient privacy.
42
+
43
+ ## Features
44
+ - **Synthetic Patient Data Generation**: Generates comprehensive patient profiles including demographics, medical history, treatment courses, and outcomes.
45
+ - **Privacy-Preserving**: Implements techniques to ensure the generated data does not reveal identifiable information.
46
+ - **Compatibility with OMOP**: Fully compatible with the OMOP common data model, allowing seamless integration with existing healthcare data systems.
47
+ - **Extensible**: Designed to be adaptable to new datasets and different EHR systems.
48
+
49
+ ## Installation
50
+ To install CEHRGPT, clone this repository and install the required dependencies.
51
+
52
+ ```bash
53
+ git clone https://github.com/knatarajan-lab/cehrgpt-public.git
54
+ cd cehrgpt-public
55
+ pip install .
56
+ ```
57
+
58
+ ## Citation
59
+ ```
60
+ @article{cehrgpt2024,
61
+ title={CEHR-GPT: Generating Electronic Health Records with Chronological Patient Timelines},
62
+ author={Natarajan, K and others},
63
+ journal={arXiv preprint arXiv:2402.04400},
64
+ year={2024}
65
+ }
66
+ ```
@@ -0,0 +1,35 @@
1
+ # CEHRGPT
2
+
3
+ [![PyPI - Version](https://img.shields.io/pypi/v/cehrgpt)](https://pypi.org/project/cehrgpt/)
4
+ ![Python](https://img.shields.io/badge/-Python_3.11-blue?logo=python&logoColor=white)
5
+ [![tests](https://github.com/knatarajan-lab/cehrgpt-public/actions/workflows/tests.yaml/badge.svg)](https://github.com/knatarajan-lab/cehrgpt-public/actions/workflows/tests.yaml)
6
+ [![license](https://img.shields.io/badge/License-MIT-green.svg?labelColor=gray)](https://github.com/knatarajan-lab/cehrgpt-public/blob/main/LICENSE)
7
+ [![contributors](https://img.shields.io/github/contributors/knatarajan-lab/cehrgpt-public.svg)](https://github.com/knatarajan-lab/cehrgpt-public/graphs/contributors)
8
+
9
+ ## Description
10
+ CEHRGPT is a synthetic data generation model developed to handle structured electronic health records (EHR) with enhanced privacy and reliability. It leverages state-of-the-art natural language processing techniques to create realistic, anonymized patient data that can be used for research and development without compromising patient privacy.
11
+
12
+ ## Features
13
+ - **Synthetic Patient Data Generation**: Generates comprehensive patient profiles including demographics, medical history, treatment courses, and outcomes.
14
+ - **Privacy-Preserving**: Implements techniques to ensure the generated data does not reveal identifiable information.
15
+ - **Compatibility with OMOP**: Fully compatible with the OMOP common data model, allowing seamless integration with existing healthcare data systems.
16
+ - **Extensible**: Designed to be adaptable to new datasets and different EHR systems.
17
+
18
+ ## Installation
19
+ To install CEHRGPT, clone this repository and install the required dependencies.
20
+
21
+ ```bash
22
+ git clone https://github.com/knatarajan-lab/cehrgpt-public.git
23
+ cd cehrgpt-public
24
+ pip install .
25
+ ```
26
+
27
+ ## Citation
28
+ ```
29
+ @article{cehrgpt2024,
30
+ title={CEHR-GPT: Generating Electronic Health Records with Chronological Patient Timelines},
31
+ author={Natarajan, K and others},
32
+ journal={arXiv preprint arXiv:2402.04400},
33
+ year={2024}
34
+ }
35
+ ```
@@ -0,0 +1,47 @@
1
+ [build-system]
2
+ requires = ["setuptools", "wheel", "setuptools_scm"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "cehrgpt"
7
+ dynamic = ["version"]
8
+ authors = [
9
+ { name = "Chao Pang", email = "chaopang229@gmail.com" },
10
+ { name = "Xinzhuo Jiang", email = "xj2193@cumc.columbia.edu" },
11
+ { name = "Krishna Kalluri", email = "kk3326@cumc.columbia.edu" },
12
+ { name = "Elise Minto", email = "em3697@cumc.columbia.edu" },
13
+ { name = "Jason Patterson", email = "jp3477@cumc.columbia.edu" },
14
+ { name = "Nishanth Parameshwar Pavinkurve", email = "np2689@cumc.columbia.edu" },
15
+ { name = "Karthik Natarajan", email = "kn2174@cumc.columbia.edu" }
16
+ ]
17
+ description = "CEHR-GPT: Generating Electronic Health Records with Chronological Patient Timelines"
18
+ readme = "README.md"
19
+ license = { text = "MIT License" }
20
+ requires-python = ">=3.10.0"
21
+
22
+ classifiers = [
23
+ "Development Status :: 5 - Production/Stable",
24
+ "Intended Audience :: Developers",
25
+ "Intended Audience :: Science/Research",
26
+ "License :: OSI Approved :: MIT License",
27
+ "Programming Language :: Python :: 3"
28
+ ]
29
+
30
+ dependencies = [
31
+ "cehrbert==1.3.3",
32
+ "openai==1.54.3",
33
+ "optuna==4.0.0",
34
+ "transformers==4.40.0",
35
+ "tokenizers==0.19",
36
+ "trl==0.11.4",
37
+ ]
38
+
39
+ [tool.setuptools_scm]
40
+
41
+ [project.optional-dependencies]
42
+ dev = [
43
+ "pre-commit", "pytest", "pytest-cov", "pytest-subtests", "rootutils", "hypothesis", "black"
44
+ ]
45
+ flash_attn = [
46
+ "flash_attn"
47
+ ]
@@ -0,0 +1,165 @@
1
+ #!/bin/bash
2
+
3
+ # Export environment variables
4
+ export OMOP_FOLDER=$1
5
+ export PATIENT_SPLITS_FOLDER=$2
6
+
7
+ # Echo input variables
8
+ echo "OMOP_FOLDER=$OMOP_FOLDER"
9
+ echo "PATIENT_SPLITS_FOLDER=$PATIENT_SPLITS_FOLDER"
10
+
11
+ # Helper function to check and create directories
12
+ create_directory_if_not_exists() {
13
+ if [ ! -d "$1" ]; then
14
+ echo "Creating $1"
15
+ mkdir -p "$1"
16
+ fi
17
+ }
18
+
19
+ # Cohort generation and baseline evaluation steps (the interpreter is already declared on line 1)
20
+
21
+ # Generate CAD CABG Cohort
22
+ echo "Generating cad_cabg"
23
+ create_directory_if_not_exists "$OMOP_FOLDER/cohorts/cad_cabg"
24
+
25
+ python -u -m cehrbert_data.prediction_cohorts.cad_cabg_cohort \
26
+ -c cad_cabg_bow \
27
+ -i "$OMOP_FOLDER" \
28
+ -o "$OMOP_FOLDER/cohorts/cad_cabg/" \
29
+ -dl 1985-01-01 -du 2023-12-31 \
30
+ -l 18 -u 100 -ow 360 -ps 0 -pw 360 -f \
31
+ --att_type cehr_bert \
32
+ --ehr_table_list condition_occurrence procedure_occurrence drug_exposure -iv
33
+
34
+ # Run Predictions on CAD CABG
35
+ echo "Run predictions on cad_cabg"
36
+ create_directory_if_not_exists "$OMOP_FOLDER/evaluation_gpt/cad_cabg"
37
+
38
+ if [ -n "$PATIENT_SPLITS_FOLDER" ]; then
39
+ python -m cehrbert.evaluations.evaluation \
40
+ -a baseline_model \
41
+ -d "$OMOP_FOLDER/cohorts/cad_cabg/cad_cabg_bow/" \
42
+ -ef "$OMOP_FOLDER/evaluation_gpt/cad_cabg/" \
43
+ --patient_splits_folder "$PATIENT_SPLITS_FOLDER"
44
+ else
45
+ python -m cehrbert.evaluations.evaluation \
46
+ -a baseline_model \
47
+ -d "$OMOP_FOLDER/cohorts/cad_cabg/cad_cabg_bow/" \
48
+ -ef "$OMOP_FOLDER/evaluation_gpt/cad_cabg/"
49
+ fi
50
+
51
+ # Generate HF Readmission
52
+ echo "Generating hf_readmission"
53
+ create_directory_if_not_exists "$OMOP_FOLDER/cohorts/hf_readmission"
54
+
55
+ python -u -m cehrbert_data.prediction_cohorts.hf_readmission \
56
+ -c hf_readmission_bow \
57
+ -i "$OMOP_FOLDER" \
58
+ -o "$OMOP_FOLDER/cohorts/hf_readmission" \
59
+ -dl 1985-01-01 -du 2023-12-31 -l 18 -u 100 -ow 360 -ps 0 -pw 30 -f \
60
+ --att_type cehr_bert \
61
+ --ehr_table_list condition_occurrence procedure_occurrence drug_exposure -iv
62
+
63
+ # Run predictions on HF Readmission
64
+ echo "Run predictions on hf_readmission"
65
+ create_directory_if_not_exists "$OMOP_FOLDER/evaluation_gpt/hf_readmission"
66
+
67
+ if [ -n "$PATIENT_SPLITS_FOLDER" ]; then
68
+ python -m cehrbert.evaluations.evaluation \
69
+ -a baseline_model \
70
+ -d "$OMOP_FOLDER/cohorts/hf_readmission/hf_readmission_bow/" \
71
+ -ef "$OMOP_FOLDER/evaluation_gpt/hf_readmission/" \
72
+ --patient_splits_folder "$PATIENT_SPLITS_FOLDER"
73
+ else
74
+ python -m cehrbert.evaluations.evaluation \
75
+ -a baseline_model \
76
+ -d "$OMOP_FOLDER/cohorts/hf_readmission/hf_readmission_bow/" \
77
+ -ef "$OMOP_FOLDER/evaluation_gpt/hf_readmission/"
78
+ fi
79
+
80
+ # Generate COPD Readmission
81
+ echo "Generating copd_readmission"
82
+ create_directory_if_not_exists "$OMOP_FOLDER/cohorts/copd_readmission"
83
+
84
+ python -u -m cehrbert_data.prediction_cohorts.copd_readmission \
85
+ -c copd_readmission_bow \
86
+ -i "$OMOP_FOLDER" \
87
+ -o "$OMOP_FOLDER/cohorts/copd_readmission" \
88
+ -dl 1985-01-01 -du 2023-12-31 -l 18 -u 100 -ow 720 -ps 0 -pw 360 -f \
89
+ --att_type cehr_bert \
90
+ --ehr_table_list condition_occurrence procedure_occurrence drug_exposure -iv
91
+
92
+ # Run predictions on COPD Readmission
93
+ echo "Run predictions on copd_readmission"
94
+ create_directory_if_not_exists "$OMOP_FOLDER/evaluation_gpt/copd_readmission"
95
+
96
+ if [ -n "$PATIENT_SPLITS_FOLDER" ]; then
97
+ python -m cehrbert.evaluations.evaluation \
98
+ -a baseline_model \
99
+ -d "$OMOP_FOLDER/cohorts/copd_readmission/copd_readmission_bow/" \
100
+ -ef "$OMOP_FOLDER/evaluation_gpt/copd_readmission/" \
101
+ --patient_splits_folder "$PATIENT_SPLITS_FOLDER"
102
+ else
103
+ python -m cehrbert.evaluations.evaluation \
104
+ -a baseline_model \
105
+ -d "$OMOP_FOLDER/cohorts/copd_readmission/copd_readmission_bow/" \
106
+ -ef "$OMOP_FOLDER/evaluation_gpt/copd_readmission/"
107
+ fi
108
+
109
+ # Generate Hospitalization
110
+ echo "Generating hospitalization"
111
+ create_directory_if_not_exists "$OMOP_FOLDER/cohorts/hospitalization"
112
+
113
+ python -u -m cehrbert_data.prediction_cohorts.hospitalization \
114
+ -c hospitalization_bow \
115
+ -i "$OMOP_FOLDER" \
116
+ -o "$OMOP_FOLDER/cohorts/hospitalization" \
117
+ -dl 1985-01-01 -du 2023-12-31 -l 18 -u 100 -ow 540 -hw 180 -ps 0 -pw 360 -f -iw \
118
+ --att_type cehr_bert \
119
+ --ehr_table_list condition_occurrence procedure_occurrence drug_exposure -iv
120
+
121
+ # Run predictions on Hospitalization
122
+ echo "Run predictions on hospitalization"
123
+ create_directory_if_not_exists "$OMOP_FOLDER/evaluation_gpt/hospitalization"
124
+
125
+ if [ -n "$PATIENT_SPLITS_FOLDER" ]; then
126
+ python -m cehrbert.evaluations.evaluation \
127
+ -a baseline_model \
128
+ -d "$OMOP_FOLDER/cohorts/hospitalization/hospitalization_bow/" \
129
+ -ef "$OMOP_FOLDER/evaluation_gpt/hospitalization/" \
130
+ --patient_splits_folder "$PATIENT_SPLITS_FOLDER"
131
+ else
132
+ python -m cehrbert.evaluations.evaluation \
133
+ -a baseline_model \
134
+ -d "$OMOP_FOLDER/cohorts/hospitalization/hospitalization_bow/" \
135
+ -ef "$OMOP_FOLDER/evaluation_gpt/hospitalization/"
136
+ fi
137
+
138
+ # Generate AFIB Ischemic Stroke
139
+ echo "Generating afib_ischemic_stroke"
140
+ create_directory_if_not_exists "$OMOP_FOLDER/cohorts/afib_ischemic_stroke"
141
+
142
+ python -u -m cehrbert_data.prediction_cohorts.afib_ischemic_stroke \
143
+ -c afib_ischemic_stroke_bow \
144
+ -i "$OMOP_FOLDER" \
145
+ -o "$OMOP_FOLDER/cohorts/afib_ischemic_stroke" \
146
+ -dl 1985-01-01 -du 2023-12-31 -l 18 -u 100 -ow 720 -ps 0 -pw 360 -f \
147
+ --att_type cehr_bert \
148
+ --ehr_table_list condition_occurrence procedure_occurrence drug_exposure -iv
149
+
150
+ # Run predictions on AFIB Ischemic Stroke
151
+ echo "Run predictions on afib_ischemic_stroke"
152
+ create_directory_if_not_exists "$OMOP_FOLDER/evaluation_gpt/afib_ischemic_stroke"
153
+
154
+ if [ -n "$PATIENT_SPLITS_FOLDER" ]; then
155
+ python -m cehrbert.evaluations.evaluation \
156
+ -a baseline_model \
157
+ -d "$OMOP_FOLDER/cohorts/afib_ischemic_stroke/afib_ischemic_stroke_bow/" \
158
+ -ef "$OMOP_FOLDER/evaluation_gpt/afib_ischemic_stroke/" \
159
+ --patient_splits_folder "$PATIENT_SPLITS_FOLDER"
160
+ else
161
+ python -m cehrbert.evaluations.evaluation \
162
+ -a baseline_model \
163
+ -d "$OMOP_FOLDER/cohorts/afib_ischemic_stroke/afib_ischemic_stroke_bow/" \
164
+ -ef "$OMOP_FOLDER/evaluation_gpt/afib_ischemic_stroke/"
165
+ fi
@@ -0,0 +1,73 @@
1
+ #!/bin/bash
2
+
3
+ # Exporting input arguments as environment variables
4
+ export PATIENT_SEQUENCE_FOLDER=$1
5
+ export OMOP_FOLDER=$2
6
+ export SOURCE_OMOP_FOLDER=$3
7
+ export PATIENT_SPLITS_FOLDER=$SOURCE_OMOP_FOLDER/patient_splits
8
+
9
+ # Echoing the values of the environment variables
10
+ echo "PATIENT_SEQUENCE_FOLDER=$PATIENT_SEQUENCE_FOLDER"
11
+ echo "OMOP_FOLDER=$OMOP_FOLDER"
12
+ echo "SOURCE_OMOP_FOLDER=$SOURCE_OMOP_FOLDER"
13
+
14
+ # Ensure OMOP_FOLDER exists
15
+ if [ ! -d "$OMOP_FOLDER" ]; then
16
+ echo "Creating $OMOP_FOLDER"
17
+ mkdir -p "$OMOP_FOLDER"
18
+ fi
19
+
20
+ # Removing existing OMOP tables; "${OMOP_FOLDER:?}" aborts the script when OMOP_FOLDER is unset or empty so rm -rf can never resolve to a root-level path such as /person/
21
+ rm -rf "${OMOP_FOLDER:?}/person/"
22
+ rm -rf "${OMOP_FOLDER:?}/visit_occurrence/"
23
+ rm -rf "${OMOP_FOLDER:?}/condition_occurrence/"
24
+ rm -rf "${OMOP_FOLDER:?}/procedure_occurrence/"
25
+ rm -rf "${OMOP_FOLDER:?}/drug_exposure/"
26
+ rm -rf "${OMOP_FOLDER:?}/death/"
27
+ rm -rf "${OMOP_FOLDER:?}/measurement/"
28
+ rm -rf "${OMOP_FOLDER:?}/observation_period/"
29
+ rm -rf "${OMOP_FOLDER:?}/condition_era/"
30
+
31
+ # Removing existing OMOP concept tables (same empty-variable guard; quoting also keeps paths with spaces intact)
32
+ rm -rf "${OMOP_FOLDER:?}/concept"
33
+ rm -rf "${OMOP_FOLDER:?}/concept_ancestor"
34
+ rm -rf "${OMOP_FOLDER:?}/concept_relationship"
35
+
36
+ # Copying OMOP concept tables from SOURCE_OMOP_FOLDER (they were just removed above, so the existence checks below always pass)
37
+ if [ ! -d $OMOP_FOLDER/concept ]; then
38
+ echo "Creating $OMOP_FOLDER/concept"
39
+ cp -r $SOURCE_OMOP_FOLDER/concept $OMOP_FOLDER/concept
40
+ fi
41
+
42
+ if [ ! -d $OMOP_FOLDER/concept_relationship ]; then
43
+ echo "Creating $OMOP_FOLDER/concept_relationship"
44
+ cp -r $SOURCE_OMOP_FOLDER/concept_relationship $OMOP_FOLDER/concept_relationship
45
+ fi
46
+
47
+ if [ ! -d $OMOP_FOLDER/concept_ancestor ]; then
48
+ echo "Creating $OMOP_FOLDER/concept_ancestor"
49
+ cp -r $SOURCE_OMOP_FOLDER/concept_ancestor $OMOP_FOLDER/concept_ancestor
50
+ fi
51
+
52
+ # Reconstructing the OMOP instance from patient sequences
53
+ echo "Reconstructing the OMOP instance from patient sequences in $OMOP_FOLDER"
54
+ python -m cehrgpt.generation.omop_converter_batch \
55
+ --patient_sequence_path $PATIENT_SEQUENCE_FOLDER \
56
+ --output_folder $OMOP_FOLDER \
57
+ --concept_path $OMOP_FOLDER/concept \
58
+ --buffer_size 1280 \
59
+ --cpu_cores 10
60
+
61
+ # Create observation_period
62
+ echo "Reconstructing observation_period in $OMOP_FOLDER"
63
+ python -u -m cehrgpt.omop.observation_period \
64
+ --input_folder $OMOP_FOLDER \
65
+ --output_folder $OMOP_FOLDER \
66
+ --domain_table_list condition_occurrence drug_exposure procedure_occurrence measurement
67
+
68
+ # Create condition_era
69
+ echo "Reconstructing condition_era in $OMOP_FOLDER"
70
+ python -u -m cehrgpt.omop.condition_era \
71
+ --input_folder $OMOP_FOLDER \
72
+ --output_folder $OMOP_FOLDER \
73
+ --domain_table_list condition_occurrence