cehrgpt 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __init__.py +0 -0
- cehrgpt/__init__.py +0 -0
- cehrgpt/analysis/__init__.py +0 -0
- cehrgpt/analysis/privacy/__init__.py +0 -0
- cehrgpt/analysis/privacy/attribute_inference.py +275 -0
- cehrgpt/analysis/privacy/attribute_inference_config.yml +8975 -0
- cehrgpt/analysis/privacy/member_inference.py +172 -0
- cehrgpt/analysis/privacy/nearest_neighbor_inference.py +189 -0
- cehrgpt/analysis/privacy/reid_inference.py +407 -0
- cehrgpt/analysis/privacy/utils.py +255 -0
- cehrgpt/cehrgpt_args.py +142 -0
- cehrgpt/data/__init__.py +0 -0
- cehrgpt/data/hf_cehrgpt_dataset.py +80 -0
- cehrgpt/data/hf_cehrgpt_dataset_collator.py +482 -0
- cehrgpt/data/hf_cehrgpt_dataset_mapping.py +116 -0
- cehrgpt/generation/__init__.py +0 -0
- cehrgpt/generation/chatgpt_generation.py +106 -0
- cehrgpt/generation/generate_batch_hf_gpt_sequence.py +333 -0
- cehrgpt/generation/omop_converter_batch.py +644 -0
- cehrgpt/generation/omop_entity.py +515 -0
- cehrgpt/gpt_utils.py +331 -0
- cehrgpt/models/__init__.py +0 -0
- cehrgpt/models/config.py +205 -0
- cehrgpt/models/hf_cehrgpt.py +1817 -0
- cehrgpt/models/hf_modeling_outputs.py +158 -0
- cehrgpt/models/pretrained_embeddings.py +82 -0
- cehrgpt/models/special_tokens.py +30 -0
- cehrgpt/models/tokenization_hf_cehrgpt.py +1077 -0
- cehrgpt/omop/__init__.py +0 -0
- cehrgpt/omop/condition_era.py +20 -0
- cehrgpt/omop/observation_period.py +43 -0
- cehrgpt/omop/omop_argparse.py +38 -0
- cehrgpt/omop/omop_table_builder.py +86 -0
- cehrgpt/omop/queries/__init__.py +0 -0
- cehrgpt/omop/queries/condition_era.py +86 -0
- cehrgpt/omop/queries/observation_period.py +135 -0
- cehrgpt/omop/sample_omop_tables.py +71 -0
- cehrgpt/runners/__init__.py +0 -0
- cehrgpt/runners/gpt_runner_util.py +99 -0
- cehrgpt/runners/hf_cehrgpt_finetune_runner.py +746 -0
- cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +370 -0
- cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +137 -0
- cehrgpt/runners/hyperparameter_search_util.py +223 -0
- cehrgpt/time_to_event/__init__.py +0 -0
- cehrgpt/time_to_event/config/30_day_readmission.yaml +8 -0
- cehrgpt/time_to_event/config/next_visit_type_prediction.yaml +8 -0
- cehrgpt/time_to_event/config/t2dm_hf.yaml +8 -0
- cehrgpt/time_to_event/time_to_event_model.py +226 -0
- cehrgpt/time_to_event/time_to_event_prediction.py +347 -0
- cehrgpt/time_to_event/time_to_event_utils.py +55 -0
- cehrgpt/tools/__init__.py +0 -0
- cehrgpt/tools/ehrshot_benchmark.py +74 -0
- cehrgpt/tools/generate_pretrained_embeddings.py +130 -0
- cehrgpt/tools/merge_synthetic_real_dataasets.py +218 -0
- cehrgpt/tools/upload_omop_tables.py +108 -0
- cehrgpt-0.0.1.dist-info/LICENSE +21 -0
- cehrgpt-0.0.1.dist-info/METADATA +66 -0
- cehrgpt-0.0.1.dist-info/RECORD +60 -0
- cehrgpt-0.0.1.dist-info/WHEEL +5 -0
- cehrgpt-0.0.1.dist-info/top_level.txt +2 -0
@@ -0,0 +1,158 @@
|
|
1
|
+
from dataclasses import dataclass
|
2
|
+
from typing import Optional, Tuple
|
3
|
+
|
4
|
+
import torch
|
5
|
+
from transformers.modeling_outputs import ModelOutput
|
6
|
+
|
7
|
+
|
8
|
+
@dataclass
class CehrGptOutputWithPast(ModelOutput):
    """
    Output container for the base model, optionally carrying cached key/values (to speed up sequential decoding).

    Every field defaults to `None`.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.

            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    # Field order is part of the interface; keep it stable.
    last_hidden_state: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
|
45
|
+
|
46
|
+
|
47
|
+
@dataclass
class CehrGptCausalLMOutput(ModelOutput):
    """
    Output container for the causal language model (autoregressive) head.

    Every field defaults to `None`.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        value_indicators (`torch.BoolTensor`):
            NOTE(review): not described by the original documentation; presumably flags which positions carry a
            value -- confirm against the model's forward pass.
        next_value_logits (`torch.FloatTensor`):
            NOTE(review): not described by the original documentation; presumably the scores of a value prediction
            head -- confirm against the model's forward pass.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `torch.FloatTensor` tuples of length `config.n_layers`, with each tuple containing the cached key,
            value states of the self-attention and the cross-attention layers if model is used in encoder-decoder
            setting. Only relevant if `config.is_decoder = True`.

            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        token_loss (`torch.FloatTensor`, *optional*):
            NOTE(review): undocumented in the original; presumably one component of `loss` -- confirm where it is set.
        time_token_loss (`torch.FloatTensor`, *optional*):
            NOTE(review): undocumented in the original; presumably one component of `loss` -- confirm where it is set.
        time_to_visit_loss (`torch.FloatTensor`, *optional*):
            NOTE(review): undocumented in the original; presumably one component of `loss` -- confirm where it is set.
        token_value_loss (`torch.FloatTensor`, *optional*):
            NOTE(review): undocumented in the original; presumably one component of `loss` -- confirm where it is set.
    """

    # Field order is part of the interface; keep it stable.
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    value_indicators: Optional[torch.BoolTensor] = None
    next_value_logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    token_loss: Optional[torch.FloatTensor] = None
    time_token_loss: Optional[torch.FloatTensor] = None
    time_to_visit_loss: Optional[torch.FloatTensor] = None
    token_value_loss: Optional[torch.FloatTensor] = None
|
88
|
+
|
89
|
+
|
90
|
+
@dataclass
class CehrGptGenerateDecoderOnlyOutput(ModelOutput):
    """
    Outputs of decoder-only generation models, when using non-beam methods.

    Every field defaults to `None`.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        sequence_val_masks (`torch.BoolTensor`, *optional*):
            NOTE(review): not described by the original documentation; presumably marks which generated positions
            have an associated value -- confirm against the generation loop.
        sequence_vals (`torch.LongTensor`, *optional*):
            NOTE(review): not described by the original documentation; presumably the values generated alongside
            `sequences` -- confirm against the generation loop.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True` is passed or when `config.output_logits=True`):
            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            NOTE: some models have a different `past_key_values` format, confirm with the model's documentation.
            Usually a Tuple (one element for each layer of the decoder) of tuples (two elements, key tensor and value
            tensor). The first Tuple is of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.
    """

    # Field order is part of the interface; keep it stable.
    sequences: Optional[torch.LongTensor] = None
    sequence_val_masks: Optional[torch.BoolTensor] = None
    sequence_vals: Optional[torch.LongTensor] = None
    scores: Optional[Tuple[torch.FloatTensor]] = None
    logits: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None
|
130
|
+
|
131
|
+
|
132
|
+
@dataclass
class CehrGptSequenceClassifierOutput(ModelOutput):
    """
    Output container for sequence classification models.

    Every field defaults to `None`.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    # Field order is part of the interface; keep it stable.
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
|
@@ -0,0 +1,82 @@
|
|
1
|
+
import pickle
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
import numpy as np
|
6
|
+
|
7
|
+
# File names used inside a model folder for the embedding matrix and its concept metadata.
PRETRAINED_EMBEDDING_VECTOR_FILE_NAME = "pretrained_embedding_vectors.npy"
PRETRAINED_EMBEDDING_CONCEPT_FILE_NAME = "pretrained_embedding_concepts.pkl"


class PretrainedEmbeddings:
    """
    A class to handle pretrained embedding vectors and their associated concepts.

    The embeddings are read from two files inside a model folder: a NumPy array
    (`PRETRAINED_EMBEDDING_VECTOR_FILE_NAME`) and a pickled sequence of concept
    records (`PRETRAINED_EMBEDDING_CONCEPT_FILE_NAME`). Each record is indexed
    with the keys ``"concept_id"`` and ``"concept_name"``; row ``i`` of the array
    belongs to record ``i``.

    Attributes:
        vector_file / concept_file: Paths of the two files, or None when no
            model folder was given.
        exists: True only when both files were found at construction time.
        pretrained_embeddings: The loaded array, or None when `exists` is False.
        pretrained_concepts: The loaded concept records, or None.
        concept_ids / concept_names: Lists extracted from the records, or None.
        reverse_concept_id_map: Maps a concept id to its row index, or None.
        embed_dim: Second dimension of the array, or 0 when nothing was loaded.
    """

    def __init__(self, model_folder: Optional[str]):
        """
        Load the embeddings from `model_folder` when both files are present.

        Args:
            model_folder: Folder expected to contain the two embedding files.
                A falsy value (None or "") yields an empty instance.
        """
        # Define the path attributes unconditionally so they can always be
        # inspected, even when no folder was supplied.
        self.vector_file: Optional[Path] = None
        self.concept_file: Optional[Path] = None
        self.exists = False

        if model_folder:
            model_path = Path(model_folder)
            self.vector_file = model_path / PRETRAINED_EMBEDDING_VECTOR_FILE_NAME
            self.concept_file = model_path / PRETRAINED_EMBEDDING_CONCEPT_FILE_NAME
            self.exists = self.vector_file.exists() and self.concept_file.exists()

        if self.exists:
            self._initialize_embeddings()
        else:
            self._initialize_empty()

    def _initialize_embeddings(self):
        """
        Load embeddings and associated concepts from files.

        Raises:
            AssertionError: If the number of embedding rows differs from the
                number of concept records.
        """
        self.pretrained_embeddings = np.load(self.vector_file)
        # NOTE(security): unpickling executes arbitrary code embedded in the
        # file. Only point this class at model folders from a trusted source.
        with open(self.concept_file, "rb") as f:
            self.pretrained_concepts = pickle.load(f)

        self.concept_ids = [
            concept["concept_id"] for concept in self.pretrained_concepts
        ]
        self.reverse_concept_id_map = {
            concept_id: i for i, concept_id in enumerate(self.concept_ids)
        }
        self.concept_names = [
            concept["concept_name"] for concept in self.pretrained_concepts
        ]
        self.embed_dim = self.pretrained_embeddings.shape[1]

        # Raised explicitly (rather than via `assert`) so the consistency check
        # is not stripped under `python -O`; the exception type is kept as
        # AssertionError for backward compatibility.
        if len(self.pretrained_embeddings) != len(self.pretrained_concepts):
            raise AssertionError(
                "The number of embeddings does not match the number of concepts."
            )

    def _initialize_empty(self):
        """Initialize empty attributes for when files do not exist."""
        self.pretrained_embeddings = None
        self.pretrained_concepts = None
        self.concept_ids = None
        self.concept_names = None
        self.reverse_concept_id_map = None
        self.embed_dim = 0

    @property
    def vocab_size(self) -> int:
        """Return the number of embedding rows, or 0 when nothing was loaded."""
        return len(self.pretrained_embeddings) if self.exists else 0

    def is_concept_available(self, concept_id: str) -> bool:
        """Check if a given concept ID has a pretrained embedding."""
        # The dict gives a constant-time membership test; scanning the
        # `concept_ids` list would be linear in the vocabulary size.
        return self.exists and concept_id in self.reverse_concept_id_map

    def get_concept_embeddings(self, concept_id: str) -> Optional[np.ndarray]:
        """
        Retrieve the embedding vector for a given concept ID.

        Returns None if the concept ID is not available.
        """
        if not self.exists:
            return None
        row_index = self.reverse_concept_id_map.get(concept_id)
        if row_index is None:
            return None
        return self.pretrained_embeddings[row_index]

    def save(self, model_folder: str):
        """
        Save the embeddings and concepts to the specified folder.

        Does nothing when no embeddings were loaded. The folder (and any
        missing parents) is created if it does not exist yet.
        """
        if not self.exists:
            return

        model_path = Path(model_folder)
        model_path.mkdir(parents=True, exist_ok=True)
        np.save(
            model_path / PRETRAINED_EMBEDDING_VECTOR_FILE_NAME,
            self.pretrained_embeddings,
        )
        with open(model_path / PRETRAINED_EMBEDDING_CONCEPT_FILE_NAME, "wb") as f:
            pickle.dump(self.pretrained_concepts, f)
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# Special (non-concept) tokens for the cehr-gpt tokenizer.
START_TOKEN = "[START]"
END_TOKEN = "[END]"
PAD_TOKEN = "[PAD]"
OUT_OF_VOCABULARY_TOKEN = "[OOV]"

# OMOP concept ids (kept as strings) that are treated as visit concepts.
# The order of the entries is preserved from the original definition.
VISIT_CONCEPT_IDS = [
    "9202", "9203", "581477", "9201", "5083",
    "262", "38004250", "0", "8883", "38004238",
    "38004251", "38004222", "38004268", "38004228", "32693",
    "8971", "38004269", "38004193", "32036", "8782",
]

# No discharge concept ids are defined here; the list starts out empty.
DISCHARGE_CONCEPT_IDS = []
|