cehrgpt 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -80,20 +80,9 @@ class TimeToEventModel:
80
80
  return token in self.outcome_events
81
81
 
82
82
  def simulate(
83
- self, partial_history: Union[np.ndarray, List[str]]
83
+ self,
84
+ partial_history: Union[np.ndarray, List[str]],
84
85
  ) -> List[List[str]]:
85
-
86
- sequence_is_demographics = len(partial_history) == 4 and partial_history[
87
- 0
88
- ].startswith("year")
89
- sequence_ends_ve = is_visit_end(partial_history[-1])
90
-
91
- if not (sequence_is_demographics | sequence_ends_ve):
92
- raise ValueError(
93
- "There are only two types of sequences allowed. 1) the sequence only contains "
94
- "demographics; 2) the sequence ends on VE;"
95
- )
96
-
97
86
  token_ids = self.tokenizer.encode(partial_history)
98
87
  prompt = torch.tensor(token_ids).unsqueeze(0).to(self.device)
99
88
 
@@ -118,9 +118,9 @@ def main(args):
118
118
  LOG.info(f"Top P {args.top_p}")
119
119
  LOG.info(f"Top K {args.top_k}")
120
120
 
121
- cehrgpt_model.resize_position_embeddings(
122
- cehrgpt_model.config.max_position_embeddings + task_config.max_new_tokens
123
- )
121
+ # cehrgpt_model.resize_position_embeddings(
122
+ # cehrgpt_model.config.max_position_embeddings + task_config.max_new_tokens
123
+ # )
124
124
 
125
125
  generation_config = TimeToEventModel.get_generation_config(
126
126
  tokenizer=cehrgpt_tokenizer,
@@ -190,14 +190,22 @@ def main(args):
190
190
  args.max_n_trial,
191
191
  )
192
192
  visit_counter = sum([int(is_visit_end(_)) for _ in partial_history])
193
+ predicted_boolean_probability = (
194
+ sum([event != "0" for event in concept_time_to_event.outcome_events])
195
+ / len(concept_time_to_event.outcome_events)
196
+ if concept_time_to_event
197
+ else 0.0
198
+ )
193
199
  tte_outputs.append(
194
200
  {
195
- "person_id": record["person_id"],
196
- "index_date": record["index_date"],
201
+ "subject_id": record["person_id"],
202
+ "prediction_time": record["index_date"],
197
203
  "visit_counter": visit_counter,
198
- "label": label,
204
+ "boolean_value": label,
205
+ "predicted_boolean_probability": predicted_boolean_probability,
206
+ "predicted_boolean_value": None,
199
207
  "time_to_event": time_to_event,
200
- "prediction": (
208
+ "trials": (
201
209
  asdict(concept_time_to_event) if concept_time_to_event else None
202
210
  ),
203
211
  }
@@ -263,9 +271,13 @@ def filter_out_existing_results(
263
271
  parquet_files = glob.glob(os.path.join(prediction_output_folder_name, "*parquet"))
264
272
  if parquet_files:
265
273
  cohort_members = set()
266
- results_dataframe = pd.read_parquet(parquet_files)[["person_id", "index_date"]]
274
+ results_dataframe = pd.read_parquet(parquet_files)[
275
+ ["subject_id", "prediction_time"]
276
+ ]
267
277
  for row in results_dataframe.itertuples():
268
- cohort_members.add((row.person_id, row.index_date.strftime("%Y-%m-%d")))
278
+ cohort_members.add(
279
+ (row.subject_id, row.prediction_time.strftime("%Y-%m-%d"))
280
+ )
269
281
 
270
282
  def filter_func(batched):
271
283
  return [
@@ -292,12 +304,14 @@ def flush_to_disk_if_full(
292
304
  pd.DataFrame(
293
305
  tte_outputs,
294
306
  columns=[
295
- "person_id",
296
- "index_date",
307
+ "subject_id",
308
+ "prediction_time",
297
309
  "visit_counter",
298
- "label",
310
+ "boolean_value",
311
+ "predicted_boolean_probability",
312
+ "predicted_boolean_value",
299
313
  "time_to_event",
300
- "prediction",
314
+ "trials",
301
315
  ],
302
316
  ).to_parquet(output_parquet_file)
303
317
  tte_outputs.clear()
@@ -9,6 +9,7 @@ from typing import Optional, Union
9
9
 
10
10
  import numpy as np
11
11
  import pandas as pd
12
+ import polars as pl
12
13
  import torch
13
14
  import torch.distributed as dist
14
15
  from cehrbert.data_generators.hf_data_generator.meds_utils import CacheFileCollector
@@ -24,13 +25,18 @@ from cehrgpt.data.hf_cehrgpt_dataset_collator import (
24
25
  CehrGptDataCollator,
25
26
  SamplePackingCehrGptDataCollator,
26
27
  )
28
+ from cehrgpt.data.hf_cehrgpt_dataset_mapping import ExtractTokenizedSequenceDataMapping
27
29
  from cehrgpt.data.sample_packing_sampler import SamplePackingBatchSampler
28
30
  from cehrgpt.models.hf_cehrgpt import (
29
31
  CEHRGPT2Model,
30
32
  extract_features_from_packed_sequence,
31
33
  )
34
+ from cehrgpt.models.special_tokens import LINEAR_PROB_TOKEN
32
35
  from cehrgpt.models.tokenization_hf_cehrgpt import CehrGptTokenizer
33
- from cehrgpt.runners.data_utils import prepare_finetune_dataset
36
+ from cehrgpt.runners.data_utils import (
37
+ extract_cohort_sequences,
38
+ prepare_finetune_dataset,
39
+ )
34
40
  from cehrgpt.runners.gpt_runner_util import parse_runner_args
35
41
  from cehrgpt.runners.hf_cehrgpt_pretrain_runner import tokenizer_exists
36
42
 
@@ -112,6 +118,11 @@ def main():
112
118
  .eval()
113
119
  .to(device)
114
120
  )
121
+
122
+ if LINEAR_PROB_TOKEN not in cehrgpt_tokenizer.get_vocab():
123
+ cehrgpt_tokenizer.add_tokens(LINEAR_PROB_TOKEN)
124
+ cehrgpt_model.resize_token_embeddings(cehrgpt_tokenizer.vocab_size)
125
+
115
126
  prepared_ds_path = generate_prepared_ds_path(
116
127
  data_args, model_args, data_folder=data_args.cohort_folder
117
128
  )
@@ -137,39 +148,48 @@ def main():
137
148
 
138
149
  if processed_dataset is None:
139
150
  if is_main_process(training_args.local_rank):
140
- # Organize them into a single DatasetDict
141
- final_splits = prepare_finetune_dataset(
142
- data_args, training_args, cehrgpt_args, cache_file_collector
143
- )
144
- if cehrgpt_args.expand_tokenizer:
145
- new_tokenizer_path = os.path.expanduser(training_args.output_dir)
146
- if tokenizer_exists(new_tokenizer_path):
147
- cehrgpt_tokenizer = CehrGptTokenizer.from_pretrained(
148
- new_tokenizer_path
149
- )
150
- else:
151
- cehrgpt_tokenizer = CehrGptTokenizer.expand_trained_tokenizer(
152
- cehrgpt_tokenizer=cehrgpt_tokenizer,
153
- dataset=final_splits["train"],
154
- data_args=data_args,
155
- concept_name_mapping={},
156
- )
157
- cehrgpt_tokenizer.save_pretrained(
158
- os.path.expanduser(training_args.output_dir)
159
- )
160
-
161
- # TODO: temp solution, this column is mixed typed and causes an issue when transforming the data
162
- if not data_args.streaming:
163
- all_columns = final_splits["train"].column_names
164
- if "visit_concept_ids" in all_columns:
165
- final_splits = final_splits.remove_columns(["visit_concept_ids"])
166
-
167
- processed_dataset = create_cehrgpt_finetuning_dataset(
168
- dataset=final_splits,
169
- cehrgpt_tokenizer=cehrgpt_tokenizer,
170
- data_args=data_args,
171
- cache_file_collector=cache_file_collector,
172
- )
151
+ # If the full dataset has already been tokenized, do not re-tokenize the cohort (a subset
152
+ # of that data); instead, slice out the relevant portion of the tokenized sequence for each sample
153
+ if cehrgpt_args.tokenized_full_dataset_path is not None:
154
+ processed_dataset = extract_cohort_sequences(
155
+ data_args, cehrgpt_args, cache_file_collector
156
+ )
157
+ else:
158
+ # Organize them into a single DatasetDict
159
+ final_splits = prepare_finetune_dataset(
160
+ data_args, training_args, cehrgpt_args, cache_file_collector
161
+ )
162
+ if cehrgpt_args.expand_tokenizer:
163
+ new_tokenizer_path = os.path.expanduser(training_args.output_dir)
164
+ if tokenizer_exists(new_tokenizer_path):
165
+ cehrgpt_tokenizer = CehrGptTokenizer.from_pretrained(
166
+ new_tokenizer_path
167
+ )
168
+ else:
169
+ cehrgpt_tokenizer = CehrGptTokenizer.expand_trained_tokenizer(
170
+ cehrgpt_tokenizer=cehrgpt_tokenizer,
171
+ dataset=final_splits["train"],
172
+ data_args=data_args,
173
+ concept_name_mapping={},
174
+ )
175
+ cehrgpt_tokenizer.save_pretrained(
176
+ os.path.expanduser(training_args.output_dir)
177
+ )
178
+
179
+ # TODO: temp solution, this column is mixed typed and causes an issue when transforming the data
180
+ if not data_args.streaming:
181
+ all_columns = final_splits["train"].column_names
182
+ if "visit_concept_ids" in all_columns:
183
+ final_splits = final_splits.remove_columns(
184
+ ["visit_concept_ids"]
185
+ )
186
+
187
+ processed_dataset = create_cehrgpt_finetuning_dataset(
188
+ dataset=final_splits,
189
+ cehrgpt_tokenizer=cehrgpt_tokenizer,
190
+ data_args=data_args,
191
+ cache_file_collector=cache_file_collector,
192
+ )
173
193
  if not data_args.streaming:
174
194
  processed_dataset.save_to_disk(prepared_ds_path)
175
195
  processed_dataset.cleanup_cache_files()
@@ -244,6 +264,7 @@ def main():
244
264
  SamplePackingCehrGptDataCollator,
245
265
  cehrgpt_args.max_tokens_per_batch,
246
266
  cehrgpt_model.config.max_position_embeddings,
267
+ add_end_token_in_sample_packing=cehrgpt_args.add_end_token_in_sample_packing,
247
268
  )
248
269
  train_batch_sampler = SamplePackingBatchSampler(
249
270
  lengths=train_set["num_of_concepts"],
@@ -278,6 +299,7 @@ def main():
278
299
  include_ttv_prediction=False,
279
300
  use_sub_time_tokenization=False,
280
301
  include_demographics=cehrgpt_args.include_demographics,
302
+ add_linear_prob_token=True,
281
303
  )
282
304
 
283
305
  train_loader = DataLoader(
@@ -298,30 +320,36 @@ def main():
298
320
  batch_sampler=test_batch_sampler,
299
321
  )
300
322
 
301
- # Loading demographics
302
- print("Loading demographics as a dictionary")
303
- demographics_df = pd.concat(
304
- [
305
- pd.read_parquet(
306
- data_dir,
307
- columns=[
308
- "person_id",
309
- "index_date",
310
- "gender_concept_id",
311
- "race_concept_id",
312
- ],
313
- )
314
- for data_dir in [data_args.data_folder, data_args.test_data_folder]
315
- ]
316
- )
317
- demographics_df["index_date"] = demographics_df.index_date.dt.date
318
- demographics_dict = {
319
- (row["person_id"], row["index_date"]): {
320
- "gender_concept_id": row["gender_concept_id"],
321
- "race_concept_id": row["race_concept_id"],
323
+ if data_args.is_data_in_meds:
324
+ demographics_dict = dict()
325
+ else:
326
+ # Loading demographics
327
+ print("Loading demographics as a dictionary")
328
+ demographics_df = pd.concat(
329
+ [
330
+ pd.read_parquet(
331
+ data_dir,
332
+ columns=[
333
+ "person_id",
334
+ "index_date",
335
+ "gender_concept_id",
336
+ "race_concept_id",
337
+ ],
338
+ )
339
+ for data_dir in [data_args.data_folder, data_args.test_data_folder]
340
+ ]
341
+ )
342
+ # This is a precaution in case index_date is not already a datetime type
343
+ demographics_df["index_date"] = pd.to_datetime(
344
+ demographics_df["index_date"]
345
+ ).dt.date
346
+ demographics_dict = {
347
+ (row["person_id"], row["index_date"]): {
348
+ "gender_concept_id": row["gender_concept_id"],
349
+ "race_concept_id": row["race_concept_id"],
350
+ }
351
+ for _, row in demographics_df.iterrows()
322
352
  }
323
- for _, row in demographics_df.iterrows()
324
- }
325
353
 
326
354
  data_loaders = [("train", train_loader), ("test", test_dataloader)]
327
355
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cehrgpt
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: CEHR-GPT: Generating Electronic Health Records with Chronological Patient Timelines
5
5
  Author-email: Chao Pang <chaopang229@gmail.com>, Xinzhuo Jiang <xj2193@cumc.columbia.edu>, Krishna Kalluri <kk3326@cumc.columbia.edu>, Elise Minto <em3697@cumc.columbia.edu>, Jason Patterson <jp3477@cumc.columbia.edu>, Nishanth Parameshwar Pavinkurve <np2689@cumc.columbia.edu>, Karthik Natarajan <kn2174@cumc.columbia.edu>
6
6
  License: MIT License
@@ -12,14 +12,15 @@ Classifier: Programming Language :: Python :: 3
12
12
  Requires-Python: >=3.10.0
13
13
  Description-Content-Type: text/markdown
14
14
  License-File: LICENSE
15
- Requires-Dist: cehrbert==1.4.1
16
- Requires-Dist: cehrbert_data==0.0.7
15
+ Requires-Dist: cehrbert==1.4.5
16
+ Requires-Dist: cehrbert_data==0.0.11
17
17
  Requires-Dist: openai==1.54.3
18
18
  Requires-Dist: optuna==4.0.0
19
- Requires-Dist: transformers==4.44.0
19
+ Requires-Dist: transformers==4.44.1
20
20
  Requires-Dist: tokenizers==0.19.0
21
21
  Requires-Dist: peft==0.10.0
22
22
  Requires-Dist: lightgbm
23
+ Requires-Dist: polars
23
24
  Provides-Extra: dev
24
25
  Requires-Dist: pre-commit; extra == "dev"
25
26
  Requires-Dist: pytest; extra == "dev"
@@ -36,9 +37,9 @@ Dynamic: license-file
36
37
 
37
38
  [![PyPI - Version](https://img.shields.io/pypi/v/cehrgpt)](https://pypi.org/project/cehrgpt/)
38
39
  ![Python](https://img.shields.io/badge/-Python_3.11-blue?logo=python&logoColor=white)
39
- [![tests](https://github.com/knatarajan-lab/cehrgpt-public/actions/workflows/tests.yaml/badge.svg)](https://github.com/knatarajan-lab/cehrgpt-public/actions/workflows/tests.yml)
40
- [![license](https://img.shields.io/badge/License-MIT-green.svg?labelColor=gray)](https://github.com/knatarajan-lab/cehrgpt-public/blob/main/LICENSE)
41
- [![contributors](https://img.shields.io/github/contributors/knatarajan-lab/cehrgpt-public.svg)](https://github.com/knatarajan-lab/cehrgpt-public/graphs/contributors)
40
+ [![tests](https://github.com/knatarajan-lab/cehrgpt/actions/workflows/tests.yaml/badge.svg)](https://github.com/knatarajan-lab/cehrgpt/actions/workflows/tests.yaml)
41
+ [![license](https://img.shields.io/badge/License-MIT-green.svg?labelColor=gray)](https://github.com/knatarajan-lab/cehrgpt/blob/main/LICENSE)
42
+ [![contributors](https://img.shields.io/github/contributors/knatarajan-lab/cehrgpt.svg)](https://github.com/knatarajan-lab/cehrgpt/graphs/contributors)
42
43
 
43
44
  ## Description
44
45
  CEHRGPT is a synthetic data generation model developed to handle structured electronic health records (EHR) with enhanced privacy and reliability. It leverages state-of-the-art natural language processing techniques to create realistic, anonymized patient data that can be used for research and development without compromising patient privacy.
@@ -1,8 +1,9 @@
1
1
  __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  cehrgpt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  cehrgpt/cehrgpt_args.py,sha256=zPLp9Qjlq5PapWx3R15BNnyaX8zV3dxr4PuWj71r0Lg,3516
4
- cehrgpt/gpt_utils.py,sha256=bksHCXMX4j859VSv1Q284rVr4gn1Y8dCx4a_V-g4mug,10939
4
+ cehrgpt/gpt_utils.py,sha256=IA5qw-hxcKkGO07AB47lDNRU6mlb9jblpKO7KeLLN78,11342
5
5
  cehrgpt/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ cehrgpt/analysis/irregularity.py,sha256=Rfl_daMvSh9cZ68vUwfmuH-JYCFXdAph2ITHHffYC0Y,1047
6
7
  cehrgpt/analysis/privacy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
8
  cehrgpt/analysis/privacy/attribute_inference.py,sha256=0ANVW0I5uvOl6IxQ15-vMVQd0mugOgSGReBUQQESImg,9368
8
9
  cehrgpt/analysis/privacy/attribute_inference_config.yml,sha256=hfLfpBlDqqsNOynpRHK414vV24edKA6ta-inmEhM2ao,103272
@@ -11,22 +12,22 @@ cehrgpt/analysis/privacy/nearest_neighbor_inference.py,sha256=qoJgWW7VsUMzjMGpTa
11
12
  cehrgpt/analysis/privacy/reid_inference.py,sha256=Pypd3QJXQNY8VljpnIEa5zeAbTZHMjQOazaL-9VsBGw,13955
12
13
  cehrgpt/analysis/privacy/utils.py,sha256=CRA4H9mPLBjMQGKzZ_x_3ro3tMap-NjsMDVqSOjHSVQ,8226
13
14
  cehrgpt/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
- cehrgpt/data/hf_cehrgpt_dataset.py,sha256=t9vpN05e--CiKgIlxLP0aLacISnvWWDPXtuFuJi3ksE,3736
15
- cehrgpt/data/hf_cehrgpt_dataset_collator.py,sha256=DOvIF4Wzkd8-IO3zpIRZkX1j0IdvefaiSnrDn1YivCk,27912
16
- cehrgpt/data/hf_cehrgpt_dataset_mapping.py,sha256=eI8CTk6yJ4DlNJWrNAkEmhWh353NeLqg5rwPpKqKT-U,17308
17
- cehrgpt/data/sample_packing_sampler.py,sha256=0uKTbvtXpfS81esy_3epJ88eohyJPK46bfmxhle1fws,5419
15
+ cehrgpt/data/hf_cehrgpt_dataset.py,sha256=hwJlGW7XiJIr6cXtmwvReQf9yLZJPD-dvJGvRg5ERqU,3755
16
+ cehrgpt/data/hf_cehrgpt_dataset_collator.py,sha256=ACMXiaYnR3bKD5dRleL0_siEvhL-2HAFcy5eBgvxnH4,44412
17
+ cehrgpt/data/hf_cehrgpt_dataset_mapping.py,sha256=KU0WMjc2vT1zBAl7JJkOc8dgGxsL1uFDy4dDrv-RkII,25668
18
+ cehrgpt/data/sample_packing_sampler.py,sha256=vovGMtmhG70DRkSCeiaDEJ_rjKZ38y-YLaI1kkhFEkI,6747
18
19
  cehrgpt/generation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
20
  cehrgpt/generation/chatgpt_generation.py,sha256=SrnLwHLdNtnAOEg36gNjqfoT9yd12iyPgpZffL2AFJo,4428
20
21
  cehrgpt/generation/generate_batch_hf_gpt_sequence.py,sha256=uSEh8aMmPD61nGewIaPSkIqm-2AxDjCBiu4cBfxHxU4,11503
21
- cehrgpt/generation/omop_converter_batch.py,sha256=-c0AlDVy5pJ5Afhr8ERiCHhoRrEk8ozJi3g0yFdWaMI,25348
22
+ cehrgpt/generation/omop_converter_batch.py,sha256=LUmCD-t_6ZP1YfNDZCqYewl-XIIaIgRZ_dAxuR_VdCQ,26275
22
23
  cehrgpt/generation/omop_entity.py,sha256=Q5Sr0AlyuPAm1FRPfnJO13q-u1fqRgYVHXruZ9g4xNE,19400
23
24
  cehrgpt/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
- cehrgpt/models/config.py,sha256=Y3CiXZWniLP9_RlpU80Oe9gjn5leLmTYnNe_fWqfJLQ,10158
25
- cehrgpt/models/hf_cehrgpt.py,sha256=3EQIOfa--oz4f8bM8KzbDi98G3XrUEQkox1vmBN001M,83321
26
- cehrgpt/models/hf_modeling_outputs.py,sha256=LaWa1jI6BRIKMEjWOy1QUeOfTur5y_p2c-JyuGVTdtw,10301
25
+ cehrgpt/models/config.py,sha256=nOAKgH5420HLCcy7n1hE7MbqR861Iq4DTutKoAd25tg,11090
26
+ cehrgpt/models/hf_cehrgpt.py,sha256=77CAkdMPgxD4xSpFU7gYGzRn6_Iv-4q7FnHpnZGsKxw,92450
27
+ cehrgpt/models/hf_modeling_outputs.py,sha256=5X4WEYKqT37phv_e5ZAv3A_N0wqdAUJLJRm6TxS6dDQ,10356
27
28
  cehrgpt/models/pretrained_embeddings.py,sha256=vLLVs17TLpXRqCVEWQxGGwPHkUJUO7laNTeBuyBK_yk,3238
28
- cehrgpt/models/special_tokens.py,sha256=-a7HPJBbdIH0qQ6B3CcRKqvpG6FZlm4nbVPTswGSJ4U,485
29
- cehrgpt/models/tokenization_hf_cehrgpt.py,sha256=jjCRqS29IzMnKp40jNOs80UKh2z9lK5S6M02GSB-4mk,42351
29
+ cehrgpt/models/special_tokens.py,sha256=lrw45B4tea4Dsajn09Cz6w5D2TfHmYXikZkgwnstu_o,521
30
+ cehrgpt/models/tokenization_hf_cehrgpt.py,sha256=cAxHTctpVBxfWfC3XcwDQavN1zwWN9Nid_Fajd5zQWQ,53159
30
31
  cehrgpt/omop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
32
  cehrgpt/omop/condition_era.py,sha256=hPZALz2XaWnro_1bwIYNkI48foOJjueyg3CZ1BliCno,626
32
33
  cehrgpt/omop/observation_period.py,sha256=TRMgv5Ya2RaS2im7oQ6BLC_5JL9EJYNYR62ApxIuHvg,1211
@@ -37,22 +38,23 @@ cehrgpt/omop/queries/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
37
38
  cehrgpt/omop/queries/condition_era.py,sha256=LFB6vBAvshHJxtYIRkl7cfrF0kf7ay0piBKpmHBwrpE,2578
38
39
  cehrgpt/omop/queries/observation_period.py,sha256=fpzr5DMNw-QLoSwp2Iatfch88E3hyhZ75usiIdG3A0U,6410
39
40
  cehrgpt/runners/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
- cehrgpt/runners/data_utils.py,sha256=ScZZnfXwgXKaMvKgFzdb4vtQ7F_lw97O5uNsFbfsyP4,10620
41
+ cehrgpt/runners/data_utils.py,sha256=I6k1TkiiZR8ggw3eVO16g2lVPY-Hu3b-nbrIOKlFIO0,15528
41
42
  cehrgpt/runners/gpt_runner_util.py,sha256=YJQSRW9Mo4TjXSOUOTf6BUFcs1MGFiXU5T4ztKZcYhU,3485
42
- cehrgpt/runners/hf_cehrgpt_finetune_runner.py,sha256=bkPl30Y9CSXBlmMkH-3cA3-aW8XJK36Q-adx___WjkE,26921
43
- cehrgpt/runners/hf_cehrgpt_pretrain_runner.py,sha256=ViVa_flEGdk_SO0psMR7ho-o79igsz_l1x80u81WJ3A,23875
44
- cehrgpt/runners/hf_gpt_runner_argument_dataclass.py,sha256=VrqgDSiAMfGyHEIodoOg_8LU5O0ndWf9EE0YOKDFKKA,7019
45
- cehrgpt/runners/hyperparameter_search_util.py,sha256=pWFmGo9Ezju4YmuZ-ohbAbYB0GGMfIDVUCyvcTxS1iU,9153
46
- cehrgpt/runners/sample_packing_trainer.py,sha256=aezX30vxpP1DDcH5hO-yn395NqBKi2Xhb0mFNHi9OBs,7340
43
+ cehrgpt/runners/hf_cehrgpt_finetune_runner.py,sha256=GVbHHqf5TWGbVWlQG-XurgYH8pKRjTk8ug_ib9L9U7E,28118
44
+ cehrgpt/runners/hf_cehrgpt_pretrain_runner.py,sha256=ERSnvB38fPYVghtKQeNTZ8VfeXnoRcCHB0cWISWaZ84,26523
45
+ cehrgpt/runners/hf_gpt_runner_argument_dataclass.py,sha256=ejAFLM9g765p1fyeF5MITsiIeWHKkz9wTeFDeVgxSto,8851
46
+ cehrgpt/runners/hyperparameter_search_util.py,sha256=YWdFQ1igQs-G_wqWUrUzYraGiz8OSpSYyvid-I5nhWA,9262
47
+ cehrgpt/runners/sample_packing_trainer.py,sha256=Zb7Aqwnk8-VqrjEKUVeg5XzZWmHxXOU2sDn1YURS-FU,7960
47
48
  cehrgpt/simulations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
48
49
  cehrgpt/simulations/generate_plots.py,sha256=BTZ71r8Kah0PMorkiO3vw55_p_9U1Z8KiD3GsPfaV0s,2520
49
50
  cehrgpt/simulations/run_simulation.sh,sha256=DcJ6B19jIteUO0pZ0Tc21876lB9XxQHFAxlre7MtAzk,795
50
51
  cehrgpt/simulations/time_embedding_simulation.py,sha256=HZ-imXH-bN-QYZN1PAfcERmNtaWIwKjbf0UrZduwCiA,8687
51
52
  cehrgpt/simulations/time_token_simulation.py,sha256=sLg8vVXydvR_zk3BbqyrlA7sDIdhFnS-s5pSKcCilSc,6057
52
53
  cehrgpt/time_to_event/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
53
- cehrgpt/time_to_event/time_to_event_model.py,sha256=tfXa24l_0q1TBZ68BPRrHRC_3KRWYxrWGIv4myJlIb8,8497
54
- cehrgpt/time_to_event/time_to_event_prediction.py,sha256=Ajesq2gSsILghWHCTLiiBhcyOCa7m6JPPMdi_xvBlR4,12624
54
+ cehrgpt/time_to_event/time_to_event_model.py,sha256=Plm0bZxvlAbnMl82DTBXWvaXLvrqcdkzcP_celX8WC4,8055
55
+ cehrgpt/time_to_event/time_to_event_prediction.py,sha256=W2e7UqIV7ELdfTy997HS66vggjnhdncCKt840knI0Dw,13183
55
56
  cehrgpt/time_to_event/time_to_event_utils.py,sha256=KN4hwGgxy2nJtO7osbYQBF3-HpmGUWefNfexzPYiEwc,1937
57
+ cehrgpt/time_to_event/config/1_year_cabg.yaml,sha256=SFF2-F5D02pDSMRddDrEUoERBCd0t2Hzln_xC-Mo2hA,407
56
58
  cehrgpt/time_to_event/config/30_day_readmission.yaml,sha256=Hn5KnEXMtSV_CtCpmAU4wjkc0-gTXvniaH991TSbUXA,234
57
59
  cehrgpt/time_to_event/config/next_visit_type_prediction.yaml,sha256=WMj2ZutEvHKIMyGG51xtXaL6MyRANKvpg9xT8ouctLc,319
58
60
  cehrgpt/time_to_event/config/t2dm_hf.yaml,sha256=_oMQzh2eJTYzEaMOpmhAzbX-qmdsKlkORELL6HxOxHo,202
@@ -63,10 +65,10 @@ cehrgpt/tools/generate_pretrained_embeddings.py,sha256=lhFSacGv8bMld6qigKZN8Op8e
63
65
  cehrgpt/tools/merge_synthetic_real_dataasets.py,sha256=O1dbQ32Le0t15fwymwAh9mfNVLEWuFwW53DNvESrWbY,7589
64
66
  cehrgpt/tools/upload_omop_tables.py,sha256=vdBAbkeAsGPA4NsyhNjelPVj3gS8yzmS1sKNM1Qk96g,3791
65
67
  cehrgpt/tools/linear_prob/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
66
- cehrgpt/tools/linear_prob/compute_cehrgpt_features.py,sha256=jVgAmBrZKp7ABfqKkzwV5Vl_G9jDCjPl98NSVmSwHpE,19291
68
+ cehrgpt/tools/linear_prob/compute_cehrgpt_features.py,sha256=q0rmlBWDDEkjHjwcTouGUhCYa32a1vRicaDOAMsdW0I,20741
67
69
  cehrgpt/tools/linear_prob/train_with_cehrgpt_features.py,sha256=w0UvzMKYGenN_KDVnbzutmy8IPLUxW5hPvpKKxDSL5U,5820
68
- cehrgpt-0.1.0.dist-info/licenses/LICENSE,sha256=LOfC32zkfUIdGm8e_098jPbt8OHKtNWymDzxn2pA9Zk,1093
69
- cehrgpt-0.1.0.dist-info/METADATA,sha256=V02vsptjJRD_bybXVRFXPrJa-By9CX4j-oAA3EfXFq4,4933
70
- cehrgpt-0.1.0.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
71
- cehrgpt-0.1.0.dist-info/top_level.txt,sha256=akNCJBbMSLV8nkOzdVzdy13hMJ5CIQURnAS_YYEDVwA,17
72
- cehrgpt-0.1.0.dist-info/RECORD,,
70
+ cehrgpt-0.1.1.dist-info/licenses/LICENSE,sha256=LOfC32zkfUIdGm8e_098jPbt8OHKtNWymDzxn2pA9Zk,1093
71
+ cehrgpt-0.1.1.dist-info/METADATA,sha256=VnXH74vJQZaV7VxGiIvJnFhQA0jzJQNx86yHFkygobM,4922
72
+ cehrgpt-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
73
+ cehrgpt-0.1.1.dist-info/top_level.txt,sha256=akNCJBbMSLV8nkOzdVzdy13hMJ5CIQURnAS_YYEDVwA,17
74
+ cehrgpt-0.1.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.7.1)
2
+ Generator: setuptools (80.9.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5