cehrgpt 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cehrgpt/analysis/irregularity.py +36 -0
- cehrgpt/data/hf_cehrgpt_dataset.py +1 -0
- cehrgpt/data/hf_cehrgpt_dataset_collator.py +398 -36
- cehrgpt/data/hf_cehrgpt_dataset_mapping.py +214 -12
- cehrgpt/data/sample_packing_sampler.py +36 -6
- cehrgpt/generation/omop_converter_batch.py +32 -2
- cehrgpt/gpt_utils.py +20 -2
- cehrgpt/models/config.py +25 -0
- cehrgpt/models/hf_cehrgpt.py +227 -33
- cehrgpt/models/hf_modeling_outputs.py +1 -0
- cehrgpt/models/special_tokens.py +1 -0
- cehrgpt/models/tokenization_hf_cehrgpt.py +354 -71
- cehrgpt/runners/data_utils.py +117 -2
- cehrgpt/runners/hf_cehrgpt_finetune_runner.py +75 -50
- cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +59 -7
- cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +48 -0
- cehrgpt/runners/hyperparameter_search_util.py +6 -7
- cehrgpt/runners/sample_packing_trainer.py +17 -0
- cehrgpt/time_to_event/config/1_year_cabg.yaml +23 -0
- cehrgpt/time_to_event/time_to_event_model.py +2 -13
- cehrgpt/time_to_event/time_to_event_prediction.py +27 -13
- cehrgpt/tools/linear_prob/compute_cehrgpt_features.py +85 -57
- {cehrgpt-0.1.0.dist-info → cehrgpt-0.1.1.dist-info}/METADATA +8 -7
- {cehrgpt-0.1.0.dist-info → cehrgpt-0.1.1.dist-info}/RECORD +27 -25
- {cehrgpt-0.1.0.dist-info → cehrgpt-0.1.1.dist-info}/WHEEL +1 -1
- {cehrgpt-0.1.0.dist-info → cehrgpt-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {cehrgpt-0.1.0.dist-info → cehrgpt-0.1.1.dist-info}/top_level.txt +0 -0
@@ -80,20 +80,9 @@ class TimeToEventModel:
|
|
80
80
|
return token in self.outcome_events
|
81
81
|
|
82
82
|
def simulate(
|
83
|
-
self,
|
83
|
+
self,
|
84
|
+
partial_history: Union[np.ndarray, List[str]],
|
84
85
|
) -> List[List[str]]:
|
85
|
-
|
86
|
-
sequence_is_demographics = len(partial_history) == 4 and partial_history[
|
87
|
-
0
|
88
|
-
].startswith("year")
|
89
|
-
sequence_ends_ve = is_visit_end(partial_history[-1])
|
90
|
-
|
91
|
-
if not (sequence_is_demographics | sequence_ends_ve):
|
92
|
-
raise ValueError(
|
93
|
-
"There are only two types of sequences allowed. 1) the sequence only contains "
|
94
|
-
"demographics; 2) the sequence ends on VE;"
|
95
|
-
)
|
96
|
-
|
97
86
|
token_ids = self.tokenizer.encode(partial_history)
|
98
87
|
prompt = torch.tensor(token_ids).unsqueeze(0).to(self.device)
|
99
88
|
|
@@ -118,9 +118,9 @@ def main(args):
|
|
118
118
|
LOG.info(f"Top P {args.top_p}")
|
119
119
|
LOG.info(f"Top K {args.top_k}")
|
120
120
|
|
121
|
-
cehrgpt_model.resize_position_embeddings(
|
122
|
-
|
123
|
-
)
|
121
|
+
# cehrgpt_model.resize_position_embeddings(
|
122
|
+
# cehrgpt_model.config.max_position_embeddings + task_config.max_new_tokens
|
123
|
+
# )
|
124
124
|
|
125
125
|
generation_config = TimeToEventModel.get_generation_config(
|
126
126
|
tokenizer=cehrgpt_tokenizer,
|
@@ -190,14 +190,22 @@ def main(args):
|
|
190
190
|
args.max_n_trial,
|
191
191
|
)
|
192
192
|
visit_counter = sum([int(is_visit_end(_)) for _ in partial_history])
|
193
|
+
predicted_boolean_probability = (
|
194
|
+
sum([event != "0" for event in concept_time_to_event.outcome_events])
|
195
|
+
/ len(concept_time_to_event.outcome_events)
|
196
|
+
if concept_time_to_event
|
197
|
+
else 0.0
|
198
|
+
)
|
193
199
|
tte_outputs.append(
|
194
200
|
{
|
195
|
-
"
|
196
|
-
"
|
201
|
+
"subject_id": record["person_id"],
|
202
|
+
"prediction_time": record["index_date"],
|
197
203
|
"visit_counter": visit_counter,
|
198
|
-
"
|
204
|
+
"boolean_value": label,
|
205
|
+
"predicted_boolean_probability": predicted_boolean_probability,
|
206
|
+
"predicted_boolean_value": None,
|
199
207
|
"time_to_event": time_to_event,
|
200
|
-
"
|
208
|
+
"trials": (
|
201
209
|
asdict(concept_time_to_event) if concept_time_to_event else None
|
202
210
|
),
|
203
211
|
}
|
@@ -263,9 +271,13 @@ def filter_out_existing_results(
|
|
263
271
|
parquet_files = glob.glob(os.path.join(prediction_output_folder_name, "*parquet"))
|
264
272
|
if parquet_files:
|
265
273
|
cohort_members = set()
|
266
|
-
results_dataframe = pd.read_parquet(parquet_files)[
|
274
|
+
results_dataframe = pd.read_parquet(parquet_files)[
|
275
|
+
["subject_id", "prediction_time"]
|
276
|
+
]
|
267
277
|
for row in results_dataframe.itertuples():
|
268
|
-
cohort_members.add(
|
278
|
+
cohort_members.add(
|
279
|
+
(row.subject_id, row.prediction_time.strftime("%Y-%m-%d"))
|
280
|
+
)
|
269
281
|
|
270
282
|
def filter_func(batched):
|
271
283
|
return [
|
@@ -292,12 +304,14 @@ def flush_to_disk_if_full(
|
|
292
304
|
pd.DataFrame(
|
293
305
|
tte_outputs,
|
294
306
|
columns=[
|
295
|
-
"
|
296
|
-
"
|
307
|
+
"subject_id",
|
308
|
+
"prediction_time",
|
297
309
|
"visit_counter",
|
298
|
-
"
|
310
|
+
"boolean_value",
|
311
|
+
"predicted_boolean_probability",
|
312
|
+
"predicted_boolean_value",
|
299
313
|
"time_to_event",
|
300
|
-
"
|
314
|
+
"trials",
|
301
315
|
],
|
302
316
|
).to_parquet(output_parquet_file)
|
303
317
|
tte_outputs.clear()
|
@@ -9,6 +9,7 @@ from typing import Optional, Union
|
|
9
9
|
|
10
10
|
import numpy as np
|
11
11
|
import pandas as pd
|
12
|
+
import polars as pl
|
12
13
|
import torch
|
13
14
|
import torch.distributed as dist
|
14
15
|
from cehrbert.data_generators.hf_data_generator.meds_utils import CacheFileCollector
|
@@ -24,13 +25,18 @@ from cehrgpt.data.hf_cehrgpt_dataset_collator import (
|
|
24
25
|
CehrGptDataCollator,
|
25
26
|
SamplePackingCehrGptDataCollator,
|
26
27
|
)
|
28
|
+
from cehrgpt.data.hf_cehrgpt_dataset_mapping import ExtractTokenizedSequenceDataMapping
|
27
29
|
from cehrgpt.data.sample_packing_sampler import SamplePackingBatchSampler
|
28
30
|
from cehrgpt.models.hf_cehrgpt import (
|
29
31
|
CEHRGPT2Model,
|
30
32
|
extract_features_from_packed_sequence,
|
31
33
|
)
|
34
|
+
from cehrgpt.models.special_tokens import LINEAR_PROB_TOKEN
|
32
35
|
from cehrgpt.models.tokenization_hf_cehrgpt import CehrGptTokenizer
|
33
|
-
from cehrgpt.runners.data_utils import
|
36
|
+
from cehrgpt.runners.data_utils import (
|
37
|
+
extract_cohort_sequences,
|
38
|
+
prepare_finetune_dataset,
|
39
|
+
)
|
34
40
|
from cehrgpt.runners.gpt_runner_util import parse_runner_args
|
35
41
|
from cehrgpt.runners.hf_cehrgpt_pretrain_runner import tokenizer_exists
|
36
42
|
|
@@ -112,6 +118,11 @@ def main():
|
|
112
118
|
.eval()
|
113
119
|
.to(device)
|
114
120
|
)
|
121
|
+
|
122
|
+
if LINEAR_PROB_TOKEN not in cehrgpt_tokenizer.get_vocab():
|
123
|
+
cehrgpt_tokenizer.add_tokens(LINEAR_PROB_TOKEN)
|
124
|
+
cehrgpt_model.resize_token_embeddings(cehrgpt_tokenizer.vocab_size)
|
125
|
+
|
115
126
|
prepared_ds_path = generate_prepared_ds_path(
|
116
127
|
data_args, model_args, data_folder=data_args.cohort_folder
|
117
128
|
)
|
@@ -137,39 +148,48 @@ def main():
|
|
137
148
|
|
138
149
|
if processed_dataset is None:
|
139
150
|
if is_main_process(training_args.local_rank):
|
140
|
-
#
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
151
|
+
# If the full dataset has been tokenized, we don't want to tokenize the cohort containing
|
152
|
+
# the subset of the data. We should slice out the portion of the tokenized sequences for each sample
|
153
|
+
if cehrgpt_args.tokenized_full_dataset_path is not None:
|
154
|
+
processed_dataset = extract_cohort_sequences(
|
155
|
+
data_args, cehrgpt_args, cache_file_collector
|
156
|
+
)
|
157
|
+
else:
|
158
|
+
# Organize them into a single DatasetDict
|
159
|
+
final_splits = prepare_finetune_dataset(
|
160
|
+
data_args, training_args, cehrgpt_args, cache_file_collector
|
161
|
+
)
|
162
|
+
if cehrgpt_args.expand_tokenizer:
|
163
|
+
new_tokenizer_path = os.path.expanduser(training_args.output_dir)
|
164
|
+
if tokenizer_exists(new_tokenizer_path):
|
165
|
+
cehrgpt_tokenizer = CehrGptTokenizer.from_pretrained(
|
166
|
+
new_tokenizer_path
|
167
|
+
)
|
168
|
+
else:
|
169
|
+
cehrgpt_tokenizer = CehrGptTokenizer.expand_trained_tokenizer(
|
170
|
+
cehrgpt_tokenizer=cehrgpt_tokenizer,
|
171
|
+
dataset=final_splits["train"],
|
172
|
+
data_args=data_args,
|
173
|
+
concept_name_mapping={},
|
174
|
+
)
|
175
|
+
cehrgpt_tokenizer.save_pretrained(
|
176
|
+
os.path.expanduser(training_args.output_dir)
|
177
|
+
)
|
178
|
+
|
179
|
+
# TODO: temp solution, this column is mixed typed and causes an issue when transforming the data
|
180
|
+
if not data_args.streaming:
|
181
|
+
all_columns = final_splits["train"].column_names
|
182
|
+
if "visit_concept_ids" in all_columns:
|
183
|
+
final_splits = final_splits.remove_columns(
|
184
|
+
["visit_concept_ids"]
|
185
|
+
)
|
186
|
+
|
187
|
+
processed_dataset = create_cehrgpt_finetuning_dataset(
|
188
|
+
dataset=final_splits,
|
189
|
+
cehrgpt_tokenizer=cehrgpt_tokenizer,
|
190
|
+
data_args=data_args,
|
191
|
+
cache_file_collector=cache_file_collector,
|
192
|
+
)
|
173
193
|
if not data_args.streaming:
|
174
194
|
processed_dataset.save_to_disk(prepared_ds_path)
|
175
195
|
processed_dataset.cleanup_cache_files()
|
@@ -244,6 +264,7 @@ def main():
|
|
244
264
|
SamplePackingCehrGptDataCollator,
|
245
265
|
cehrgpt_args.max_tokens_per_batch,
|
246
266
|
cehrgpt_model.config.max_position_embeddings,
|
267
|
+
add_end_token_in_sample_packing=cehrgpt_args.add_end_token_in_sample_packing,
|
247
268
|
)
|
248
269
|
train_batch_sampler = SamplePackingBatchSampler(
|
249
270
|
lengths=train_set["num_of_concepts"],
|
@@ -278,6 +299,7 @@ def main():
|
|
278
299
|
include_ttv_prediction=False,
|
279
300
|
use_sub_time_tokenization=False,
|
280
301
|
include_demographics=cehrgpt_args.include_demographics,
|
302
|
+
add_linear_prob_token=True,
|
281
303
|
)
|
282
304
|
|
283
305
|
train_loader = DataLoader(
|
@@ -298,30 +320,36 @@ def main():
|
|
298
320
|
batch_sampler=test_batch_sampler,
|
299
321
|
)
|
300
322
|
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
323
|
+
if data_args.is_data_in_meds:
|
324
|
+
demographics_dict = dict()
|
325
|
+
else:
|
326
|
+
# Loading demographics
|
327
|
+
print("Loading demographics as a dictionary")
|
328
|
+
demographics_df = pd.concat(
|
329
|
+
[
|
330
|
+
pd.read_parquet(
|
331
|
+
data_dir,
|
332
|
+
columns=[
|
333
|
+
"person_id",
|
334
|
+
"index_date",
|
335
|
+
"gender_concept_id",
|
336
|
+
"race_concept_id",
|
337
|
+
],
|
338
|
+
)
|
339
|
+
for data_dir in [data_args.data_folder, data_args.test_data_folder]
|
340
|
+
]
|
341
|
+
)
|
342
|
+
# This is a pre-caution in case the index_date is not a datetime type
|
343
|
+
demographics_df["index_date"] = pd.to_datetime(
|
344
|
+
demographics_df["index_date"]
|
345
|
+
).dt.date
|
346
|
+
demographics_dict = {
|
347
|
+
(row["person_id"], row["index_date"]): {
|
348
|
+
"gender_concept_id": row["gender_concept_id"],
|
349
|
+
"race_concept_id": row["race_concept_id"],
|
350
|
+
}
|
351
|
+
for _, row in demographics_df.iterrows()
|
322
352
|
}
|
323
|
-
for _, row in demographics_df.iterrows()
|
324
|
-
}
|
325
353
|
|
326
354
|
data_loaders = [("train", train_loader), ("test", test_dataloader)]
|
327
355
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: cehrgpt
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.1
|
4
4
|
Summary: CEHR-GPT: Generating Electronic Health Records with Chronological Patient Timelines
|
5
5
|
Author-email: Chao Pang <chaopang229@gmail.com>, Xinzhuo Jiang <xj2193@cumc.columbia.edu>, Krishna Kalluri <kk3326@cumc.columbia.edu>, Elise Minto <em3697@cumc.columbia.edu>, Jason Patterson <jp3477@cumc.columbia.edu>, Nishanth Parameshwar Pavinkurve <np2689@cumc.columbia.edu>, Karthik Natarajan <kn2174@cumc.columbia.edu>
|
6
6
|
License: MIT License
|
@@ -12,14 +12,15 @@ Classifier: Programming Language :: Python :: 3
|
|
12
12
|
Requires-Python: >=3.10.0
|
13
13
|
Description-Content-Type: text/markdown
|
14
14
|
License-File: LICENSE
|
15
|
-
Requires-Dist: cehrbert==1.4.
|
16
|
-
Requires-Dist: cehrbert_data==0.0.
|
15
|
+
Requires-Dist: cehrbert==1.4.5
|
16
|
+
Requires-Dist: cehrbert_data==0.0.11
|
17
17
|
Requires-Dist: openai==1.54.3
|
18
18
|
Requires-Dist: optuna==4.0.0
|
19
|
-
Requires-Dist: transformers==4.44.
|
19
|
+
Requires-Dist: transformers==4.44.1
|
20
20
|
Requires-Dist: tokenizers==0.19.0
|
21
21
|
Requires-Dist: peft==0.10.0
|
22
22
|
Requires-Dist: lightgbm
|
23
|
+
Requires-Dist: polars
|
23
24
|
Provides-Extra: dev
|
24
25
|
Requires-Dist: pre-commit; extra == "dev"
|
25
26
|
Requires-Dist: pytest; extra == "dev"
|
@@ -36,9 +37,9 @@ Dynamic: license-file
|
|
36
37
|
|
37
38
|
[](https://pypi.org/project/cehrgpt/)
|
38
39
|

|
39
|
-
[](https://github.com/knatarajan-lab/cehrgpt
|
41
|
-
[](https://github.com/knatarajan-lab/cehrgpt/actions/workflows/tests.yaml)
|
41
|
+
[](https://github.com/knatarajan-lab/cehrgpt/blob/main/LICENSE)
|
42
|
+
[](https://github.com/knatarajan-lab/cehrgpt/graphs/contributors)
|
42
43
|
|
43
44
|
## Description
|
44
45
|
CEHRGPT is a synthetic data generation model developed to handle structured electronic health records (EHR) with enhanced privacy and reliability. It leverages state-of-the-art natural language processing techniques to create realistic, anonymized patient data that can be used for research and development without compromising patient privacy.
|
@@ -1,8 +1,9 @@
|
|
1
1
|
__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
cehrgpt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
3
|
cehrgpt/cehrgpt_args.py,sha256=zPLp9Qjlq5PapWx3R15BNnyaX8zV3dxr4PuWj71r0Lg,3516
|
4
|
-
cehrgpt/gpt_utils.py,sha256=
|
4
|
+
cehrgpt/gpt_utils.py,sha256=IA5qw-hxcKkGO07AB47lDNRU6mlb9jblpKO7KeLLN78,11342
|
5
5
|
cehrgpt/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
+
cehrgpt/analysis/irregularity.py,sha256=Rfl_daMvSh9cZ68vUwfmuH-JYCFXdAph2ITHHffYC0Y,1047
|
6
7
|
cehrgpt/analysis/privacy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
8
|
cehrgpt/analysis/privacy/attribute_inference.py,sha256=0ANVW0I5uvOl6IxQ15-vMVQd0mugOgSGReBUQQESImg,9368
|
8
9
|
cehrgpt/analysis/privacy/attribute_inference_config.yml,sha256=hfLfpBlDqqsNOynpRHK414vV24edKA6ta-inmEhM2ao,103272
|
@@ -11,22 +12,22 @@ cehrgpt/analysis/privacy/nearest_neighbor_inference.py,sha256=qoJgWW7VsUMzjMGpTa
|
|
11
12
|
cehrgpt/analysis/privacy/reid_inference.py,sha256=Pypd3QJXQNY8VljpnIEa5zeAbTZHMjQOazaL-9VsBGw,13955
|
12
13
|
cehrgpt/analysis/privacy/utils.py,sha256=CRA4H9mPLBjMQGKzZ_x_3ro3tMap-NjsMDVqSOjHSVQ,8226
|
13
14
|
cehrgpt/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
-
cehrgpt/data/hf_cehrgpt_dataset.py,sha256=
|
15
|
-
cehrgpt/data/hf_cehrgpt_dataset_collator.py,sha256=
|
16
|
-
cehrgpt/data/hf_cehrgpt_dataset_mapping.py,sha256=
|
17
|
-
cehrgpt/data/sample_packing_sampler.py,sha256=
|
15
|
+
cehrgpt/data/hf_cehrgpt_dataset.py,sha256=hwJlGW7XiJIr6cXtmwvReQf9yLZJPD-dvJGvRg5ERqU,3755
|
16
|
+
cehrgpt/data/hf_cehrgpt_dataset_collator.py,sha256=ACMXiaYnR3bKD5dRleL0_siEvhL-2HAFcy5eBgvxnH4,44412
|
17
|
+
cehrgpt/data/hf_cehrgpt_dataset_mapping.py,sha256=KU0WMjc2vT1zBAl7JJkOc8dgGxsL1uFDy4dDrv-RkII,25668
|
18
|
+
cehrgpt/data/sample_packing_sampler.py,sha256=vovGMtmhG70DRkSCeiaDEJ_rjKZ38y-YLaI1kkhFEkI,6747
|
18
19
|
cehrgpt/generation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
20
|
cehrgpt/generation/chatgpt_generation.py,sha256=SrnLwHLdNtnAOEg36gNjqfoT9yd12iyPgpZffL2AFJo,4428
|
20
21
|
cehrgpt/generation/generate_batch_hf_gpt_sequence.py,sha256=uSEh8aMmPD61nGewIaPSkIqm-2AxDjCBiu4cBfxHxU4,11503
|
21
|
-
cehrgpt/generation/omop_converter_batch.py,sha256
|
22
|
+
cehrgpt/generation/omop_converter_batch.py,sha256=LUmCD-t_6ZP1YfNDZCqYewl-XIIaIgRZ_dAxuR_VdCQ,26275
|
22
23
|
cehrgpt/generation/omop_entity.py,sha256=Q5Sr0AlyuPAm1FRPfnJO13q-u1fqRgYVHXruZ9g4xNE,19400
|
23
24
|
cehrgpt/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
24
|
-
cehrgpt/models/config.py,sha256=
|
25
|
-
cehrgpt/models/hf_cehrgpt.py,sha256=
|
26
|
-
cehrgpt/models/hf_modeling_outputs.py,sha256=
|
25
|
+
cehrgpt/models/config.py,sha256=nOAKgH5420HLCcy7n1hE7MbqR861Iq4DTutKoAd25tg,11090
|
26
|
+
cehrgpt/models/hf_cehrgpt.py,sha256=77CAkdMPgxD4xSpFU7gYGzRn6_Iv-4q7FnHpnZGsKxw,92450
|
27
|
+
cehrgpt/models/hf_modeling_outputs.py,sha256=5X4WEYKqT37phv_e5ZAv3A_N0wqdAUJLJRm6TxS6dDQ,10356
|
27
28
|
cehrgpt/models/pretrained_embeddings.py,sha256=vLLVs17TLpXRqCVEWQxGGwPHkUJUO7laNTeBuyBK_yk,3238
|
28
|
-
cehrgpt/models/special_tokens.py,sha256
|
29
|
-
cehrgpt/models/tokenization_hf_cehrgpt.py,sha256=
|
29
|
+
cehrgpt/models/special_tokens.py,sha256=lrw45B4tea4Dsajn09Cz6w5D2TfHmYXikZkgwnstu_o,521
|
30
|
+
cehrgpt/models/tokenization_hf_cehrgpt.py,sha256=cAxHTctpVBxfWfC3XcwDQavN1zwWN9Nid_Fajd5zQWQ,53159
|
30
31
|
cehrgpt/omop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
31
32
|
cehrgpt/omop/condition_era.py,sha256=hPZALz2XaWnro_1bwIYNkI48foOJjueyg3CZ1BliCno,626
|
32
33
|
cehrgpt/omop/observation_period.py,sha256=TRMgv5Ya2RaS2im7oQ6BLC_5JL9EJYNYR62ApxIuHvg,1211
|
@@ -37,22 +38,23 @@ cehrgpt/omop/queries/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
|
|
37
38
|
cehrgpt/omop/queries/condition_era.py,sha256=LFB6vBAvshHJxtYIRkl7cfrF0kf7ay0piBKpmHBwrpE,2578
|
38
39
|
cehrgpt/omop/queries/observation_period.py,sha256=fpzr5DMNw-QLoSwp2Iatfch88E3hyhZ75usiIdG3A0U,6410
|
39
40
|
cehrgpt/runners/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
40
|
-
cehrgpt/runners/data_utils.py,sha256=
|
41
|
+
cehrgpt/runners/data_utils.py,sha256=I6k1TkiiZR8ggw3eVO16g2lVPY-Hu3b-nbrIOKlFIO0,15528
|
41
42
|
cehrgpt/runners/gpt_runner_util.py,sha256=YJQSRW9Mo4TjXSOUOTf6BUFcs1MGFiXU5T4ztKZcYhU,3485
|
42
|
-
cehrgpt/runners/hf_cehrgpt_finetune_runner.py,sha256=
|
43
|
-
cehrgpt/runners/hf_cehrgpt_pretrain_runner.py,sha256=
|
44
|
-
cehrgpt/runners/hf_gpt_runner_argument_dataclass.py,sha256=
|
45
|
-
cehrgpt/runners/hyperparameter_search_util.py,sha256=
|
46
|
-
cehrgpt/runners/sample_packing_trainer.py,sha256=
|
43
|
+
cehrgpt/runners/hf_cehrgpt_finetune_runner.py,sha256=GVbHHqf5TWGbVWlQG-XurgYH8pKRjTk8ug_ib9L9U7E,28118
|
44
|
+
cehrgpt/runners/hf_cehrgpt_pretrain_runner.py,sha256=ERSnvB38fPYVghtKQeNTZ8VfeXnoRcCHB0cWISWaZ84,26523
|
45
|
+
cehrgpt/runners/hf_gpt_runner_argument_dataclass.py,sha256=ejAFLM9g765p1fyeF5MITsiIeWHKkz9wTeFDeVgxSto,8851
|
46
|
+
cehrgpt/runners/hyperparameter_search_util.py,sha256=YWdFQ1igQs-G_wqWUrUzYraGiz8OSpSYyvid-I5nhWA,9262
|
47
|
+
cehrgpt/runners/sample_packing_trainer.py,sha256=Zb7Aqwnk8-VqrjEKUVeg5XzZWmHxXOU2sDn1YURS-FU,7960
|
47
48
|
cehrgpt/simulations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
48
49
|
cehrgpt/simulations/generate_plots.py,sha256=BTZ71r8Kah0PMorkiO3vw55_p_9U1Z8KiD3GsPfaV0s,2520
|
49
50
|
cehrgpt/simulations/run_simulation.sh,sha256=DcJ6B19jIteUO0pZ0Tc21876lB9XxQHFAxlre7MtAzk,795
|
50
51
|
cehrgpt/simulations/time_embedding_simulation.py,sha256=HZ-imXH-bN-QYZN1PAfcERmNtaWIwKjbf0UrZduwCiA,8687
|
51
52
|
cehrgpt/simulations/time_token_simulation.py,sha256=sLg8vVXydvR_zk3BbqyrlA7sDIdhFnS-s5pSKcCilSc,6057
|
52
53
|
cehrgpt/time_to_event/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
53
|
-
cehrgpt/time_to_event/time_to_event_model.py,sha256=
|
54
|
-
cehrgpt/time_to_event/time_to_event_prediction.py,sha256=
|
54
|
+
cehrgpt/time_to_event/time_to_event_model.py,sha256=Plm0bZxvlAbnMl82DTBXWvaXLvrqcdkzcP_celX8WC4,8055
|
55
|
+
cehrgpt/time_to_event/time_to_event_prediction.py,sha256=W2e7UqIV7ELdfTy997HS66vggjnhdncCKt840knI0Dw,13183
|
55
56
|
cehrgpt/time_to_event/time_to_event_utils.py,sha256=KN4hwGgxy2nJtO7osbYQBF3-HpmGUWefNfexzPYiEwc,1937
|
57
|
+
cehrgpt/time_to_event/config/1_year_cabg.yaml,sha256=SFF2-F5D02pDSMRddDrEUoERBCd0t2Hzln_xC-Mo2hA,407
|
56
58
|
cehrgpt/time_to_event/config/30_day_readmission.yaml,sha256=Hn5KnEXMtSV_CtCpmAU4wjkc0-gTXvniaH991TSbUXA,234
|
57
59
|
cehrgpt/time_to_event/config/next_visit_type_prediction.yaml,sha256=WMj2ZutEvHKIMyGG51xtXaL6MyRANKvpg9xT8ouctLc,319
|
58
60
|
cehrgpt/time_to_event/config/t2dm_hf.yaml,sha256=_oMQzh2eJTYzEaMOpmhAzbX-qmdsKlkORELL6HxOxHo,202
|
@@ -63,10 +65,10 @@ cehrgpt/tools/generate_pretrained_embeddings.py,sha256=lhFSacGv8bMld6qigKZN8Op8e
|
|
63
65
|
cehrgpt/tools/merge_synthetic_real_dataasets.py,sha256=O1dbQ32Le0t15fwymwAh9mfNVLEWuFwW53DNvESrWbY,7589
|
64
66
|
cehrgpt/tools/upload_omop_tables.py,sha256=vdBAbkeAsGPA4NsyhNjelPVj3gS8yzmS1sKNM1Qk96g,3791
|
65
67
|
cehrgpt/tools/linear_prob/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
66
|
-
cehrgpt/tools/linear_prob/compute_cehrgpt_features.py,sha256=
|
68
|
+
cehrgpt/tools/linear_prob/compute_cehrgpt_features.py,sha256=q0rmlBWDDEkjHjwcTouGUhCYa32a1vRicaDOAMsdW0I,20741
|
67
69
|
cehrgpt/tools/linear_prob/train_with_cehrgpt_features.py,sha256=w0UvzMKYGenN_KDVnbzutmy8IPLUxW5hPvpKKxDSL5U,5820
|
68
|
-
cehrgpt-0.1.
|
69
|
-
cehrgpt-0.1.
|
70
|
-
cehrgpt-0.1.
|
71
|
-
cehrgpt-0.1.
|
72
|
-
cehrgpt-0.1.
|
70
|
+
cehrgpt-0.1.1.dist-info/licenses/LICENSE,sha256=LOfC32zkfUIdGm8e_098jPbt8OHKtNWymDzxn2pA9Zk,1093
|
71
|
+
cehrgpt-0.1.1.dist-info/METADATA,sha256=VnXH74vJQZaV7VxGiIvJnFhQA0jzJQNx86yHFkygobM,4922
|
72
|
+
cehrgpt-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
73
|
+
cehrgpt-0.1.1.dist-info/top_level.txt,sha256=akNCJBbMSLV8nkOzdVzdy13hMJ5CIQURnAS_YYEDVwA,17
|
74
|
+
cehrgpt-0.1.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|