cehrgpt 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __init__.py +0 -0
- cehrgpt/__init__.py +0 -0
- cehrgpt/analysis/__init__.py +0 -0
- cehrgpt/analysis/privacy/__init__.py +0 -0
- cehrgpt/analysis/privacy/attribute_inference.py +275 -0
- cehrgpt/analysis/privacy/attribute_inference_config.yml +8975 -0
- cehrgpt/analysis/privacy/member_inference.py +172 -0
- cehrgpt/analysis/privacy/nearest_neighbor_inference.py +189 -0
- cehrgpt/analysis/privacy/reid_inference.py +407 -0
- cehrgpt/analysis/privacy/utils.py +255 -0
- cehrgpt/cehrgpt_args.py +142 -0
- cehrgpt/data/__init__.py +0 -0
- cehrgpt/data/hf_cehrgpt_dataset.py +80 -0
- cehrgpt/data/hf_cehrgpt_dataset_collator.py +482 -0
- cehrgpt/data/hf_cehrgpt_dataset_mapping.py +116 -0
- cehrgpt/generation/__init__.py +0 -0
- cehrgpt/generation/chatgpt_generation.py +106 -0
- cehrgpt/generation/generate_batch_hf_gpt_sequence.py +333 -0
- cehrgpt/generation/omop_converter_batch.py +644 -0
- cehrgpt/generation/omop_entity.py +515 -0
- cehrgpt/gpt_utils.py +331 -0
- cehrgpt/models/__init__.py +0 -0
- cehrgpt/models/config.py +205 -0
- cehrgpt/models/hf_cehrgpt.py +1817 -0
- cehrgpt/models/hf_modeling_outputs.py +158 -0
- cehrgpt/models/pretrained_embeddings.py +82 -0
- cehrgpt/models/special_tokens.py +30 -0
- cehrgpt/models/tokenization_hf_cehrgpt.py +1077 -0
- cehrgpt/omop/__init__.py +0 -0
- cehrgpt/omop/condition_era.py +20 -0
- cehrgpt/omop/observation_period.py +43 -0
- cehrgpt/omop/omop_argparse.py +38 -0
- cehrgpt/omop/omop_table_builder.py +86 -0
- cehrgpt/omop/queries/__init__.py +0 -0
- cehrgpt/omop/queries/condition_era.py +86 -0
- cehrgpt/omop/queries/observation_period.py +135 -0
- cehrgpt/omop/sample_omop_tables.py +71 -0
- cehrgpt/runners/__init__.py +0 -0
- cehrgpt/runners/gpt_runner_util.py +99 -0
- cehrgpt/runners/hf_cehrgpt_finetune_runner.py +746 -0
- cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +370 -0
- cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +137 -0
- cehrgpt/runners/hyperparameter_search_util.py +223 -0
- cehrgpt/time_to_event/__init__.py +0 -0
- cehrgpt/time_to_event/config/30_day_readmission.yaml +8 -0
- cehrgpt/time_to_event/config/next_visit_type_prediction.yaml +8 -0
- cehrgpt/time_to_event/config/t2dm_hf.yaml +8 -0
- cehrgpt/time_to_event/time_to_event_model.py +226 -0
- cehrgpt/time_to_event/time_to_event_prediction.py +347 -0
- cehrgpt/time_to_event/time_to_event_utils.py +55 -0
- cehrgpt/tools/__init__.py +0 -0
- cehrgpt/tools/ehrshot_benchmark.py +74 -0
- cehrgpt/tools/generate_pretrained_embeddings.py +130 -0
- cehrgpt/tools/merge_synthetic_real_dataasets.py +218 -0
- cehrgpt/tools/upload_omop_tables.py +108 -0
- cehrgpt-0.0.1.dist-info/LICENSE +21 -0
- cehrgpt-0.0.1.dist-info/METADATA +66 -0
- cehrgpt-0.0.1.dist-info/RECORD +60 -0
- cehrgpt-0.0.1.dist-info/WHEEL +5 -0
- cehrgpt-0.0.1.dist-info/top_level.txt +2 -0
--- /dev/null
+++ b/cehrgpt/tools/merge_synthetic_real_dataasets.py
@@ -0,0 +1,218 @@
+import argparse
+import logging
+import os
+import shutil
+from enum import Enum
+from typing import List
+
+from cehrbert_data.utils.logging_utils import add_console_logging
+from cehrbert_data.utils.spark_utils import validate_table_names
+from pyspark.sql import SparkSession
+from pyspark.sql import functions as F
+
+add_console_logging()
+logger = logging.getLogger(__name__)
+
+COHORT_FOLDER_NAME = "cohorts"
+
+
+class MergeType(Enum):
+    TRAIN_AND_TEST = "train_and_test"
+    TEST_ONLY = "test_only"
+
+
+def main(
+    real_omop_folder: str,
+    synthetic_omop_folder: str,
+    domain_table_list: List[str],
+    output_folder: str,
+    merge_type: str,
+):
+    spark = SparkSession.builder.appName(
+        "Merge Synthetic OMOP and Real OMOP datasets"
+    ).getOrCreate()
+
+    logger.info(
+        f"real_omop_folder: {real_omop_folder}\n"
+        f"synthetic_omop_folder: {synthetic_omop_folder}\n"
+        f"output_folder: {output_folder}\n"
+        f"domain_table_list: {domain_table_list}\n"
+        f"merge_type: {merge_type}\n"
+    )
+
+    patient_splits_folder = os.path.join(real_omop_folder, "patient_splits")
+    if not os.path.exists(patient_splits_folder):
+        raise RuntimeError(f"patient_splits must exist in {real_omop_folder}")
+
+    patient_splits = spark.read.parquet(patient_splits_folder)
+    patient_splits = patient_splits.select("person_id", "split")
+
+    # Generate the real patient splits
+    real_person = spark.read.parquet(os.path.join(real_omop_folder, "person"))
+    real_person = (
+        real_person.select("person_id")
+        .join(patient_splits, "person_id")
+        .withColumn("is_real", F.lit(1))
+    )
+    max_real_person_id = real_person.select(F.max("person_id")).collect()[0][0]
+    if merge_type == MergeType.TEST_ONLY.value:
+        real_person = real_person.where("split='test'")
+    synthetic_person = spark.read.parquet(os.path.join(synthetic_omop_folder, "person"))
+    synthetic_person = (
+        synthetic_person.select("person_id")
+        .withColumn("split", F.lit("train"))
+        .withColumn("is_real", F.lit(0))
+    )
+    merge_patient_splits = real_person.unionByName(synthetic_person)
+    merge_patient_splits = merge_patient_splits.withColumn(
+        "new_person_id",
+        F.when(F.col("is_real") == 1, F.col("person_id")).otherwise(
+            F.col("person_id") + F.lit(max_real_person_id)
+        ),
+    )
+    merge_patient_splits.cache()
+
+    # Re-assign visit_occurrence_id
+    real_visit_occurrence = spark.read.parquet(
+        os.path.join(real_omop_folder, "visit_occurrence")
+    )
+    max_real_visit_occurrence_id = real_visit_occurrence.select(
+        F.max("visit_occurrence_id")
+    ).collect()[0][0]
+
+    synthetic_visit_occurrence_mapping = (
+        spark.read.parquet(os.path.join(synthetic_omop_folder, "visit_occurrence"))
+        .select("visit_occurrence_id")
+        .withColumn(
+            "new_visit_occurrence_id",
+            F.col("visit_occurrence_id") + F.lit(max_real_visit_occurrence_id),
+        )
+    )
+
+    for domain_table in domain_table_list:
+        real_domain_table = spark.read.parquet(
+            os.path.join(real_omop_folder, domain_table)
+        )
+        synthetic_domain_table = spark.read.parquet(
+            os.path.join(synthetic_omop_folder, domain_table)
+        )
+        # The synthetic and real datasets should have the same schema; as a
+        # precaution, keep only the columns that exist in both datasets
+        real_columns = real_domain_table.schema.fieldNames()
+        synthetic_columns = synthetic_domain_table.schema.fieldNames()
+        common_columns = [f for f in synthetic_columns if f in real_columns]
+
+        real_domain_table = real_domain_table.join(
+            merge_patient_splits,
+            (real_domain_table["person_id"] == merge_patient_splits["person_id"])
+            & (merge_patient_splits["is_real"] == 1),
+        ).select(
+            [real_domain_table[f] for f in common_columns]
+            + [
+                merge_patient_splits["is_real"],
+                merge_patient_splits["split"],
+                merge_patient_splits["new_person_id"],
+            ]
+        )
+
+        synthetic_domain_table = synthetic_domain_table.join(
+            merge_patient_splits,
+            (synthetic_domain_table["person_id"] == merge_patient_splits["person_id"])
+            & (merge_patient_splits["is_real"] == 0),
+        ).select(
+            [synthetic_domain_table[f] for f in common_columns]
+            + [
+                merge_patient_splits["is_real"],
+                merge_patient_splits["split"],
+                merge_patient_splits["new_person_id"],
+            ]
+        )
+        # Re-map visit_occurrence_id
+        if "visit_occurrence_id" in [
+            _.lower() for _ in synthetic_domain_table.schema.fieldNames()
+        ]:
+            synthetic_domain_table = (
+                synthetic_domain_table.join(
+                    synthetic_visit_occurrence_mapping, "visit_occurrence_id"
+                )
+                .drop("visit_occurrence_id")
+                .withColumnRenamed("new_visit_occurrence_id", "visit_occurrence_id")
+            )
+
+        merge_domain_table = real_domain_table.unionByName(synthetic_domain_table)
+        merge_domain_table = (
+            merge_domain_table.withColumnRenamed("person_id", "original_person_id")
+            .withColumnRenamed("new_person_id", "person_id")
+            .drop("new_person_id")
+        )
+        merge_domain_table.write.mode("overwrite").parquet(
+            os.path.join(output_folder, domain_table)
+        )
+
+    # Rename the columns for the patient splits dataframe
+    merge_patient_splits.withColumnRenamed(
+        "person_id", "original_person_id"
+    ).withColumnRenamed("new_person_id", "person_id").write.mode("overwrite").parquet(
+        os.path.join(output_folder, "patient_splits")
+    )
+
+    # Copy concept tables
+    for concept_table in ["concept", "concept_relationship", "concept_ancestor"]:
+        shutil.copytree(
+            os.path.join(real_omop_folder, concept_table),
+            os.path.join(output_folder, concept_table),
+        )
+
+
+def create_app_arg_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        description="Arguments for merging synthetic and real OMOP datasets"
+    )
+    parser.add_argument(
+        "--real_omop_folder",
+        dest="real_omop_folder",
+        action="store",
+        help="The path for your input_folder where the Real OMOP folder is",
+        required=True,
+    )
+    parser.add_argument(
+        "--synthetic_omop_folder",
+        dest="synthetic_omop_folder",
+        action="store",
+        help="The path for your input_folder where the Synthetic OMOP folder is",
+        required=True,
+    )
+    parser.add_argument(
+        "--domain_table_list",
+        dest="domain_table_list",
+        nargs="+",
+        action="store",
+        help="The list of domain tables you want to merge",
+        type=validate_table_names,
+        required=True,
+    )
+    parser.add_argument(
+        "--output_folder",
+        dest="output_folder",
+        action="store",
+        help="The path for your output_folder",
+        required=True,
+    )
+    parser.add_argument(
+        "--merge_type",
+        dest="merge_type",
+        action="store",
+        choices=[e.value for e in MergeType],
+    )
+    return parser
+
+
+if __name__ == "__main__":
+    ARGS = create_app_arg_parser().parse_args()
+    main(
+        ARGS.real_omop_folder,
+        ARGS.synthetic_omop_folder,
+        ARGS.domain_table_list,
+        ARGS.output_folder,
+        ARGS.merge_type,
+    )
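The script above is packaged under `cehrgpt.tools`, so once the wheel is installed it can be run with `python -m`. The sketch below is a hypothetical invocation assembled from the argparse definitions in the diff; all paths and table names are placeholders, and `--merge_type` takes one of the two `MergeType` values, `train_and_test` or `test_only`.

```bash
# Hypothetical invocation of the merge script; the folders are placeholders.
python -m cehrgpt.tools.merge_synthetic_real_dataasets \
  --real_omop_folder /data/real_omop \
  --synthetic_omop_folder /data/synthetic_omop \
  --domain_table_list condition_occurrence drug_exposure procedure_occurrence \
  --output_folder /data/merged_omop \
  --merge_type test_only
```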
--- /dev/null
+++ b/cehrgpt/tools/upload_omop_tables.py
@@ -0,0 +1,108 @@
+import argparse
+import configparser
+from pathlib import Path
+
+import pyspark.sql.functions as f
+from pyspark.sql import SparkSession
+
+# Define timestamp column for filtering based on the folder name
+omop_timestamp_dict = {
+    "person": "birth_datetime",
+    "condition_occurrence": "condition_start_date",
+    "measurement": "measurement_date",
+    "drug_exposure": "drug_exposure_start_date",
+    "procedure_occurrence": "procedure_date",
+    "observation": "observation_date",
+    "visit_occurrence": "visit_start_date",
+}
+
+
+# Function to initialize and return the SparkSession
+def get_spark_session():
+    spark = (
+        SparkSession.builder.appName("OMOP Upload")
+        .config("spark.sql.legacy.parquet.int96RebaseModeInRead", "CORRECTED")
+        .config("spark.sql.legacy.parquet.int96RebaseModeInWrite", "CORRECTED")
+        .config("spark.sql.legacy.parquet.datetimeRebaseModeInRead", "CORRECTED")
+        .config("spark.sql.legacy.parquet.datetimeRebaseModeInWrite", "CORRECTED")
+        .getOrCreate()
+    )
+    return spark
+
+
+# Function to upload OMOP tables to a database
+def upload_omop_tables(spark, domain_table_folder, db_properties):
+    # Load parquet files from the specified folder
+    df = spark.read.format("parquet").load(str(domain_table_folder) + "/")
+
+    # Filter out dates outside of the acceptable range
+    if domain_table_folder.name in omop_timestamp_dict:
+        timestamp_column = omop_timestamp_dict[domain_table_folder.name]
+        df = df.filter(f.col(timestamp_column) > f.lit("1900-01-01").cast("date"))
+        df = df.filter(f.col(timestamp_column) < f.lit("9999-01-01").cast("date"))
+
+    # Cast appropriate columns to integer and date types
+    for column in df.columns:
+        if "concept_id" in column:
+            df = df.withColumn(column, f.col(column).cast("integer"))
+        if "date" in column:
+            df = df.withColumn(column, f.col(column).cast("date"))
+
+    # Write to the database with specified options
+    df.repartition(10).write.format("jdbc").options(
+        url=db_properties["base_url"],
+        dbtable=domain_table_folder.name,
+        user=db_properties["user"],
+        password=db_properties["password"],
+        batchsize=200000,
+        queryTimeout=500,
+    ).mode("overwrite").save()
+
+
+# Main function to process the folders and upload tables
+def main(credential_path, input_folder):
+    # Load database properties from the credentials file
+    config = configparser.ConfigParser()
+    config.read(credential_path)
+    db_properties = config.defaults()
+
+    # Initialize SparkSession
+    spark = get_spark_session()
+
+    # Process each folder within the input folder
+    input_folder = Path(input_folder)
+    uploaded_tables = []
+    for folder in input_folder.glob("*"):
+        try:
+            if folder.is_dir() and folder.name in omop_timestamp_dict:
+                upload_omop_tables(spark, folder, db_properties)
+                uploaded_tables.append(folder.name)
+                print(f"Table: {folder.name} uploaded successfully")
+        except Exception as e:
+            print(f"Error uploading table {folder.name}: {e}")
+
+    print(f"Uploaded tables: {uploaded_tables}")
+
+
+# Parse command-line arguments and run the upload
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Arguments for uploading OMOP tables")
+
+    parser.add_argument(
+        "-c",
+        "--credential_path",
+        required=True,
+        help="The path for your database credentials",
+    )
+
+    parser.add_argument(
+        "-i",
+        "--input_folder",
+        required=True,
+        help="Path to the input folder containing the OMOP tables",
+    )
+
+    args = parser.parse_args()
+
+    # Call the main function with parsed arguments
+    main(args.credential_path, args.input_folder)
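Because the script reads its database settings through `configparser` and `config.defaults()`, the credentials file needs a `[DEFAULT]` section containing the three keys the code looks up: `base_url`, `user`, and `password`. Below is a minimal sketch; the JDBC URL, user, and password are placeholders, not values shipped with the package, and Spark must also have the matching JDBC driver on its classpath for the `jdbc` writer to work.

```bash
# Sketch of a credentials file; all values are placeholders.
cat > credentials.ini <<'EOF'
[DEFAULT]
base_url = jdbc:postgresql://localhost:5432/omop
user = omop_user
password = changeme
EOF

# Hypothetical invocation; the input folder is expected to contain one
# subfolder per OMOP table named in omop_timestamp_dict (person, ...).
python -m cehrgpt.tools.upload_omop_tables \
  --credential_path credentials.ini \
  --input_folder /data/merged_omop
```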
--- /dev/null
+++ b/cehrgpt-0.0.1.dist-info/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Department of Biomedical Informatics
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- /dev/null
+++ b/cehrgpt-0.0.1.dist-info/METADATA
@@ -0,0 +1,66 @@
+Metadata-Version: 2.2
+Name: cehrgpt
+Version: 0.0.1
+Summary: CEHR-GPT: Generating Electronic Health Records with Chronological Patient Timelines
+Author-email: Chao Pang <chaopang229@gmail.com>, Xinzhuo Jiang <xj2193@cumc.columbia.edu>, Krishna Kalluri <kk3326@cumc.columbia.edu>, Elise Minto <em3697@cumc.columbia.edu>, Jason Patterson <jp3477@cumc.columbia.edu>, Nishanth Parameshwar Pavinkurve <np2689@cumc.columbia.edu>, Karthik Natarajan <kn2174@cumc.columbia.edu>
+License: MIT License
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Requires-Python: >=3.10.0
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: cehrbert==1.3.3
+Requires-Dist: openai==1.54.3
+Requires-Dist: optuna==4.0.0
+Requires-Dist: transformers==4.40.0
+Requires-Dist: tokenizers==0.19
+Requires-Dist: trl==0.11.4
+Provides-Extra: dev
+Requires-Dist: pre-commit; extra == "dev"
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: pytest-cov; extra == "dev"
+Requires-Dist: pytest-subtests; extra == "dev"
+Requires-Dist: rootutils; extra == "dev"
+Requires-Dist: hypothesis; extra == "dev"
+Requires-Dist: black; extra == "dev"
+Provides-Extra: flash-attn
+Requires-Dist: flash_attn; extra == "flash-attn"
+
+# CEHRGPT
+
+[](https://pypi.org/project/cehrgpt/)
+![PyPI - Python Version](https://img.shields.io/pypi/pyversions/cehrgpt)
+[](https://github.com/knatarajan-lab/cehrgpt-public/actions/workflows/tests.yml)
+[](https://github.com/knatarajan-lab/cehrgpt-public/blob/main/LICENSE)
+[](https://github.com/knatarajan-lab/cehrgpt-public/graphs/contributors)
+
+## Description
+CEHRGPT is a synthetic data generation model developed to handle structured electronic health records (EHR) with enhanced privacy and reliability. It leverages state-of-the-art natural language processing techniques to create realistic, anonymized patient data that can be used for research and development without compromising patient privacy.
+
+## Features
+- **Synthetic Patient Data Generation**: Generates comprehensive patient profiles including demographics, medical history, treatment courses, and outcomes.
+- **Privacy-Preserving**: Implements techniques to ensure the generated data does not reveal identifiable information.
+- **Compatibility with OMOP**: Fully compatible with the OMOP common data model, allowing seamless integration with existing healthcare data systems.
+- **Extensible**: Designed to be adaptable to new datasets and different EHR systems.
+
+## Installation
+To install CEHRGPT, clone this repository and install the required dependencies.
+
+```bash
+git clone https://github.com/knatarajan-lab/cehrgpt-public.git
+cd cehrgpt-public
+pip install .
+```
+
+## Citation
+```
+@article{cehrgpt2024,
+  title={CEHRGPT: Synthetic Data Generation for Electronic Health Records},
+  author={Natarajan, K and others},
+  journal={arXiv preprint arXiv:2402.04400},
+  year={2024}
+}
+```
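Since the METADATA above names the package `cehrgpt` at version 0.0.1 and the README links to its PyPI project page, installing the published wheel directly is presumably an alternative to the source install shown in the README:

```bash
# Presumed PyPI install, based on the Name/Version fields in METADATA.
pip install cehrgpt==0.0.1
```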
--- /dev/null
+++ b/cehrgpt-0.0.1.dist-info/RECORD
@@ -0,0 +1,60 @@
+__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cehrgpt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cehrgpt/cehrgpt_args.py,sha256=zPLp9Qjlq5PapWx3R15BNnyaX8zV3dxr4PuWj71r0Lg,3516
+cehrgpt/gpt_utils.py,sha256=bksHCXMX4j859VSv1Q284rVr4gn1Y8dCx4a_V-g4mug,10939
+cehrgpt/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cehrgpt/analysis/privacy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cehrgpt/analysis/privacy/attribute_inference.py,sha256=0ANVW0I5uvOl6IxQ15-vMVQd0mugOgSGReBUQQESImg,9368
+cehrgpt/analysis/privacy/attribute_inference_config.yml,sha256=hfLfpBlDqqsNOynpRHK414vV24edKA6ta-inmEhM2ao,103272
+cehrgpt/analysis/privacy/member_inference.py,sha256=a_-4rkYYffYl0ucnjK6uYy8jesupt8vDObIg3l-mp4M,5903
+cehrgpt/analysis/privacy/nearest_neighbor_inference.py,sha256=qoJgWW7VsUMzjMGpTaK84iY_QLOuF3HCYXAEKLZOZsU,6391
+cehrgpt/analysis/privacy/reid_inference.py,sha256=Pypd3QJXQNY8VljpnIEa5zeAbTZHMjQOazaL-9VsBGw,13955
+cehrgpt/analysis/privacy/utils.py,sha256=CRA4H9mPLBjMQGKzZ_x_3ro3tMap-NjsMDVqSOjHSVQ,8226
+cehrgpt/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cehrgpt/data/hf_cehrgpt_dataset.py,sha256=7hvjjqE8WInVuRvAtNkFI_J-xluFBv1Ij4TPTdUxPM4,2570
+cehrgpt/data/hf_cehrgpt_dataset_collator.py,sha256=RYw5Isrwa4sdyQQ3Nf3cu7xPDA3m-c5ecCFf_y1TJKY,20497
+cehrgpt/data/hf_cehrgpt_dataset_mapping.py,sha256=aQ0gsThOFhrh9ExpJhRmuiwN9ShIKheLgCIci-N7HOM,4305
+cehrgpt/generation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cehrgpt/generation/chatgpt_generation.py,sha256=SrnLwHLdNtnAOEg36gNjqfoT9yd12iyPgpZffL2AFJo,4428
+cehrgpt/generation/generate_batch_hf_gpt_sequence.py,sha256=-WLpKlulVVDJSdA2jXyp87gfLW4Q3aAtwULK8fDtn_E,11408
+cehrgpt/generation/omop_converter_batch.py,sha256=SDpWjqzi8dsgVzbbFes42GMdZEvrJ3sm4RbP5UpmIlk,25280
+cehrgpt/generation/omop_entity.py,sha256=Q5Sr0AlyuPAm1FRPfnJO13q-u1fqRgYVHXruZ9g4xNE,19400
+cehrgpt/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cehrgpt/models/config.py,sha256=xek4W_siO7WtMAKE7zDsENotsIE70F8dcW-PTC0kBKk,9700
+cehrgpt/models/hf_cehrgpt.py,sha256=YrHhT8c92xcOVTb6FjFQokyHrDOcXgEDMBs0BksSBpA,75739
+cehrgpt/models/hf_modeling_outputs.py,sha256=LaWa1jI6BRIKMEjWOy1QUeOfTur5y_p2c-JyuGVTdtw,10301
+cehrgpt/models/pretrained_embeddings.py,sha256=vLLVs17TLpXRqCVEWQxGGwPHkUJUO7laNTeBuyBK_yk,3238
+cehrgpt/models/special_tokens.py,sha256=-a7HPJBbdIH0qQ6B3CcRKqvpG6FZlm4nbVPTswGSJ4U,485
+cehrgpt/models/tokenization_hf_cehrgpt.py,sha256=jQR5aHjdHhS14nC1qnqDmybS1gpB27WK2-qVNz9cxW0,42156
+cehrgpt/omop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cehrgpt/omop/condition_era.py,sha256=hPZALz2XaWnro_1bwIYNkI48foOJjueyg3CZ1BliCno,626
+cehrgpt/omop/observation_period.py,sha256=TRMgv5Ya2RaS2im7oQ6BLC_5JL9EJYNYR62ApxIuHvg,1211
+cehrgpt/omop/omop_argparse.py,sha256=WI_-vZGfPdZ8atIeB-CrpaPdkv07kDBabyEpaRZfl64,998
+cehrgpt/omop/omop_table_builder.py,sha256=6K_YYKyayDUBwxUdwaliI5tufpfIQqByDY5HeBbjHok,2742
+cehrgpt/omop/sample_omop_tables.py,sha256=2JZ8BNSvssceinwFanvuCRh-YlKrKn25U9w1pL79kQ0,2300
+cehrgpt/omop/queries/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cehrgpt/omop/queries/condition_era.py,sha256=LFB6vBAvshHJxtYIRkl7cfrF0kf7ay0piBKpmHBwrpE,2578
+cehrgpt/omop/queries/observation_period.py,sha256=fpzr5DMNw-QLoSwp2Iatfch88E3hyhZ75usiIdG3A0U,6410
+cehrgpt/runners/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cehrgpt/runners/gpt_runner_util.py,sha256=88HKSVj-ADGBCMo7C3znKSMPnAAALa1iU_6P6i9sD0M,3867
+cehrgpt/runners/hf_cehrgpt_finetune_runner.py,sha256=aGw87ZJuUIH196ryaZzt9D4hCAHVcDyKnvvdVPdipwc,31568
+cehrgpt/runners/hf_cehrgpt_pretrain_runner.py,sha256=6xulvnjwy6LDRPIL_zgsYH7sJMiXJ9AvFg3p2o35S6c,16510
+cehrgpt/runners/hf_gpt_runner_argument_dataclass.py,sha256=2l1X5bp1zckoFp0rQkxGptXyG8u3PgNw0dqYVDWLYjg,5155
+cehrgpt/runners/hyperparameter_search_util.py,sha256=i4qAb_22JO78l40MSyBPwDgAGuGc96efXmg_833cSSo,9044
+cehrgpt/time_to_event/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cehrgpt/time_to_event/time_to_event_model.py,sha256=tfXa24l_0q1TBZ68BPRrHRC_3KRWYxrWGIv4myJlIb8,8497
+cehrgpt/time_to_event/time_to_event_prediction.py,sha256=Ajesq2gSsILghWHCTLiiBhcyOCa7m6JPPMdi_xvBlR4,12624
+cehrgpt/time_to_event/time_to_event_utils.py,sha256=KN4hwGgxy2nJtO7osbYQBF3-HpmGUWefNfexzPYiEwc,1937
+cehrgpt/time_to_event/config/30_day_readmission.yaml,sha256=Hn5KnEXMtSV_CtCpmAU4wjkc0-gTXvniaH991TSbUXA,234
+cehrgpt/time_to_event/config/next_visit_type_prediction.yaml,sha256=WMj2ZutEvHKIMyGG51xtXaL6MyRANKvpg9xT8ouctLc,319
+cehrgpt/time_to_event/config/t2dm_hf.yaml,sha256=_oMQzh2eJTYzEaMOpmhAzbX-qmdsKlkORELL6HxOxHo,202
+cehrgpt/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cehrgpt/tools/ehrshot_benchmark.py,sha256=E-m_5srlYEw7Y7i9twIJWDvrkwNlop-6yZB-80FZid0,2667
+cehrgpt/tools/generate_pretrained_embeddings.py,sha256=lhFSacGv8bMld6qigKZN8Op8eXpFi0DsJuQbWKOWXqI,4160
+cehrgpt/tools/merge_synthetic_real_dataasets.py,sha256=O1dbQ32Le0t15fwymwAh9mfNVLEWuFwW53DNvESrWbY,7589
+cehrgpt/tools/upload_omop_tables.py,sha256=vdBAbkeAsGPA4NsyhNjelPVj3gS8yzmS1sKNM1Qk96g,3791
+cehrgpt-0.0.1.dist-info/LICENSE,sha256=LOfC32zkfUIdGm8e_098jPbt8OHKtNWymDzxn2pA9Zk,1093
+cehrgpt-0.0.1.dist-info/METADATA,sha256=BZrsoZe0Smn4JoA3cCI63fC4nBvOVrC9sgZ0Ct1NJsA,3388
+cehrgpt-0.0.1.dist-info/WHEEL,sha256=nn6H5-ilmfVryoAQl3ZQ2l8SH5imPWFpm1A5FgEuFV4,91
+cehrgpt-0.0.1.dist-info/top_level.txt,sha256=akNCJBbMSLV8nkOzdVzdy13hMJ5CIQURnAS_YYEDVwA,17
+cehrgpt-0.0.1.dist-info/RECORD,,