cehrgpt-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. __init__.py +0 -0
  2. cehrgpt/__init__.py +0 -0
  3. cehrgpt/analysis/__init__.py +0 -0
  4. cehrgpt/analysis/privacy/__init__.py +0 -0
  5. cehrgpt/analysis/privacy/attribute_inference.py +275 -0
  6. cehrgpt/analysis/privacy/attribute_inference_config.yml +8975 -0
  7. cehrgpt/analysis/privacy/member_inference.py +172 -0
  8. cehrgpt/analysis/privacy/nearest_neighbor_inference.py +189 -0
  9. cehrgpt/analysis/privacy/reid_inference.py +407 -0
  10. cehrgpt/analysis/privacy/utils.py +255 -0
  11. cehrgpt/cehrgpt_args.py +142 -0
  12. cehrgpt/data/__init__.py +0 -0
  13. cehrgpt/data/hf_cehrgpt_dataset.py +80 -0
  14. cehrgpt/data/hf_cehrgpt_dataset_collator.py +482 -0
  15. cehrgpt/data/hf_cehrgpt_dataset_mapping.py +116 -0
  16. cehrgpt/generation/__init__.py +0 -0
  17. cehrgpt/generation/chatgpt_generation.py +106 -0
  18. cehrgpt/generation/generate_batch_hf_gpt_sequence.py +333 -0
  19. cehrgpt/generation/omop_converter_batch.py +644 -0
  20. cehrgpt/generation/omop_entity.py +515 -0
  21. cehrgpt/gpt_utils.py +331 -0
  22. cehrgpt/models/__init__.py +0 -0
  23. cehrgpt/models/config.py +205 -0
  24. cehrgpt/models/hf_cehrgpt.py +1817 -0
  25. cehrgpt/models/hf_modeling_outputs.py +158 -0
  26. cehrgpt/models/pretrained_embeddings.py +82 -0
  27. cehrgpt/models/special_tokens.py +30 -0
  28. cehrgpt/models/tokenization_hf_cehrgpt.py +1077 -0
  29. cehrgpt/omop/__init__.py +0 -0
  30. cehrgpt/omop/condition_era.py +20 -0
  31. cehrgpt/omop/observation_period.py +43 -0
  32. cehrgpt/omop/omop_argparse.py +38 -0
  33. cehrgpt/omop/omop_table_builder.py +86 -0
  34. cehrgpt/omop/queries/__init__.py +0 -0
  35. cehrgpt/omop/queries/condition_era.py +86 -0
  36. cehrgpt/omop/queries/observation_period.py +135 -0
  37. cehrgpt/omop/sample_omop_tables.py +71 -0
  38. cehrgpt/runners/__init__.py +0 -0
  39. cehrgpt/runners/gpt_runner_util.py +99 -0
  40. cehrgpt/runners/hf_cehrgpt_finetune_runner.py +746 -0
  41. cehrgpt/runners/hf_cehrgpt_pretrain_runner.py +370 -0
  42. cehrgpt/runners/hf_gpt_runner_argument_dataclass.py +137 -0
  43. cehrgpt/runners/hyperparameter_search_util.py +223 -0
  44. cehrgpt/time_to_event/__init__.py +0 -0
  45. cehrgpt/time_to_event/config/30_day_readmission.yaml +8 -0
  46. cehrgpt/time_to_event/config/next_visit_type_prediction.yaml +8 -0
  47. cehrgpt/time_to_event/config/t2dm_hf.yaml +8 -0
  48. cehrgpt/time_to_event/time_to_event_model.py +226 -0
  49. cehrgpt/time_to_event/time_to_event_prediction.py +347 -0
  50. cehrgpt/time_to_event/time_to_event_utils.py +55 -0
  51. cehrgpt/tools/__init__.py +0 -0
  52. cehrgpt/tools/ehrshot_benchmark.py +74 -0
  53. cehrgpt/tools/generate_pretrained_embeddings.py +130 -0
  54. cehrgpt/tools/merge_synthetic_real_dataasets.py +218 -0
  55. cehrgpt/tools/upload_omop_tables.py +108 -0
  56. cehrgpt-0.0.1.dist-info/LICENSE +21 -0
  57. cehrgpt-0.0.1.dist-info/METADATA +66 -0
  58. cehrgpt-0.0.1.dist-info/RECORD +60 -0
  59. cehrgpt-0.0.1.dist-info/WHEEL +5 -0
  60. cehrgpt-0.0.1.dist-info/top_level.txt +2 -0
cehrgpt/models/hf_modeling_outputs.py
@@ -0,0 +1,158 @@
+ from dataclasses import dataclass
+ from typing import Optional, Tuple
+
+ import torch
+ from transformers.modeling_outputs import ModelOutput
+
+
+ @dataclass
+ class CehrGptOutputWithPast(ModelOutput):
+     """
+     Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
+
+     Args:
+         last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+             Sequence of hidden-states at the output of the last layer of the model.
+
+             If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+             hidden_size)` is output.
+         past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+             Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+             `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
+             `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
+             encoder_sequence_length, embed_size_per_head)`.
+
+             Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
+             `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
+             input) to speed up sequential decoding.
+         hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+             Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+             one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+             Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+         attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+             Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+             sequence_length)`.
+
+             Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+             heads.
+     """
+
+     last_hidden_state: torch.FloatTensor = None
+     past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+     hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+     attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+
+
+ @dataclass
+ class CehrGptCausalLMOutput(ModelOutput):
+     """
+     Base class for causal language model (or autoregressive) outputs.
+
+     Args:
+         loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+             Language modeling loss (for next-token prediction).
+         logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+         hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+             Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+             one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+             Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+         attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+             Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+             sequence_length)`.
+
+             Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+             heads.
+         past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+             Tuple of `torch.FloatTensor` tuples of length `config.n_layers`, with each tuple containing the cached key,
+             value states of the self-attention and the cross-attention layers if model is used in encoder-decoder
+             setting. Only relevant if `config.is_decoder = True`.
+
+             Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
+             `past_key_values` input) to speed up sequential decoding.
+     """
+
+     loss: Optional[torch.FloatTensor] = None
+     logits: torch.FloatTensor = None
+     value_indicators: torch.BoolTensor = None
+     next_value_logits: torch.FloatTensor = None
+     past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+     hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+     attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+     token_loss: Optional[torch.FloatTensor] = None
+     time_token_loss: Optional[torch.FloatTensor] = None
+     time_to_visit_loss: Optional[torch.FloatTensor] = None
+     token_value_loss: Optional[torch.FloatTensor] = None
+
+
+ @dataclass
+ class CehrGptGenerateDecoderOnlyOutput(ModelOutput):
+     """
+     Outputs of decoder-only generation models, when using non-beam methods.
+
+     Args:
+         sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+             The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
+             if all batches finished early due to the `eos_token_id`.
+         scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+             Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
+             at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
+             each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
+         logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True` is passed or when `config.output_logits=True`):
+             Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
+             at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
+             each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
+         attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+             `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+         hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
+             `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
+         past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+             NOTE: some models have a different `past_key_values` format, confirm with the model's documentation.
+             Usually a Tuple (one element for each layer of the decoder) of tuples (two elements, key tensor and value
+             tensor). The first Tuple is of length `config.n_layers`, with each tuple having 2 tensors of shape
+             `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
+             `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
+             encoder_sequence_length, embed_size_per_head)`.
+     """
+
+     sequences: torch.LongTensor = None
+     sequence_val_masks: Optional[torch.BoolTensor] = None
+     sequence_vals: Optional[torch.LongTensor] = None
+     scores: Optional[Tuple[torch.FloatTensor]] = None
+     logits: Optional[Tuple[torch.FloatTensor]] = None
+     attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+     hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+     past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None
+
+
+ @dataclass
+ class CehrGptSequenceClassifierOutput(ModelOutput):
+     """
+     Base class for outputs of sentence classification models.
+
+     Args:
+         loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+             Classification (or regression if config.num_labels==1) loss.
+         logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+             Classification (or regression if config.num_labels==1) scores (before SoftMax).
+         hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+             Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+             one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+             Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+         attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+             Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+             sequence_length)`.
+
+             Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+             heads.
+     """
+
+     loss: Optional[torch.FloatTensor] = None
+     logits: torch.FloatTensor = None
+     hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+     attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
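The dataclasses above all subclass `transformers.ModelOutput`, so they behave like ordered dictionaries whose `None`-valued fields are skipped. The following sketch is not part of the packaged code; tensor shapes and values are illustrative, and it only shows how a `CehrGptCausalLMOutput` would typically be constructed and read.

import torch

from cehrgpt.models.hf_modeling_outputs import CehrGptCausalLMOutput

# Illustrative shapes: batch_size=2, sequence_length=16, vocab_size=100.
logits = torch.randn(2, 16, 100)
output = CehrGptCausalLMOutput(
    loss=torch.tensor(1.23),
    logits=logits,
    token_loss=torch.tensor(1.10),
    time_token_loss=torch.tensor(0.13),
)

output.logits.shape  # attribute access -> torch.Size([2, 16, 100])
output["loss"]       # dict-style access, same tensor as output.loss
output.to_tuple()    # tuple containing only the fields that are not None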
cehrgpt/models/pretrained_embeddings.py
@@ -0,0 +1,82 @@
+ import pickle
+ from pathlib import Path
+ from typing import Optional
+
+ import numpy as np
+
+ PRETRAINED_EMBEDDING_VECTOR_FILE_NAME = "pretrained_embedding_vectors.npy"
+ PRETRAINED_EMBEDDING_CONCEPT_FILE_NAME = "pretrained_embedding_concepts.pkl"
+
+
+ class PretrainedEmbeddings:
+     """A class to handle pretrained embedding vectors and their associated concepts."""
+
+     def __init__(self, model_folder: Optional[str]):
+         if model_folder:
+             model_path = Path(model_folder)
+             self.vector_file = model_path / PRETRAINED_EMBEDDING_VECTOR_FILE_NAME
+             self.concept_file = model_path / PRETRAINED_EMBEDDING_CONCEPT_FILE_NAME
+             self.exists = self.vector_file.exists() and self.concept_file.exists()
+         else:
+             self.exists = False
+         self._initialize_embeddings() if self.exists else self._initialize_empty()
+
+     def _initialize_embeddings(self):
+         """Load embeddings and associated concepts from files."""
+         self.pretrained_embeddings = np.load(self.vector_file)
+         with open(self.concept_file, "rb") as f:
+             self.pretrained_concepts = pickle.load(f)
+
+         self.concept_ids = [
+             concept["concept_id"] for concept in self.pretrained_concepts
+         ]
+         self.reverse_concept_id_map = {
+             concept_id: i for i, concept_id in enumerate(self.concept_ids)
+         }
+         self.concept_names = [
+             concept["concept_name"] for concept in self.pretrained_concepts
+         ]
+         self.embed_dim = self.pretrained_embeddings.shape[1]
+
+         assert len(self.pretrained_embeddings) == len(
+             self.pretrained_concepts
+         ), "The number of embeddings does not match the number of concepts."
+
+     def _initialize_empty(self):
+         """Initialize empty attributes for when files do not exist."""
+         self.pretrained_embeddings = None
+         self.pretrained_concepts = None
+         self.concept_ids = None
+         self.concept_names = None
+         self.reverse_concept_id_map = None
+         self.embed_dim = 0
+
+     @property
+     def vocab_size(self) -> int:
+         """Return the size of the vocabulary."""
+         return len(self.pretrained_embeddings) if self.exists else 0
+
+     def is_concept_available(self, concept_id: str) -> bool:
+         """Check if a given concept ID is available."""
+         return self.exists and concept_id in self.concept_ids
+
+     def get_concept_embeddings(self, concept_id: str) -> Optional[np.ndarray]:
+         """
+         Retrieve the embedding vector for a given concept ID.
+
+         Returns None if the concept ID is not available.
+         """
+         if self.is_concept_available(concept_id):
+             return self.pretrained_embeddings[self.reverse_concept_id_map[concept_id]]
+         return None
+
+     def save(self, model_folder: str):
+         """Save the embeddings and concepts to the specified folder."""
+         if self.exists:
+             model_path = Path(model_folder)
+             np.save(
+                 model_path / PRETRAINED_EMBEDDING_VECTOR_FILE_NAME,
+                 self.pretrained_embeddings,
+             )
+             with open(model_path / PRETRAINED_EMBEDDING_CONCEPT_FILE_NAME, "wb") as f:
+                 pickle.dump(self.pretrained_concepts, f)
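`PretrainedEmbeddings` expects two artifacts side by side in the model folder: an `(n_concepts, embed_dim)` NumPy matrix and a pickled list of concept dictionaries carrying `concept_id` and `concept_name` keys; if either file is missing it falls back to an empty, zero-sized vocabulary. Below is a minimal round-trip sketch, not part of the packaged code; the folder name, embedding dimension, and concept names are illustrative assumptions.

import pickle
from pathlib import Path

import numpy as np

from cehrgpt.models.pretrained_embeddings import (
    PRETRAINED_EMBEDDING_CONCEPT_FILE_NAME,
    PRETRAINED_EMBEDDING_VECTOR_FILE_NAME,
    PretrainedEmbeddings,
)

# Hypothetical folder used only for this sketch.
model_folder = Path("example_model_folder")
model_folder.mkdir(exist_ok=True)

# Parallel artifacts: an (n_concepts, embed_dim) matrix and a pickled list of concept dicts.
vectors = np.random.rand(3, 128).astype(np.float32)
concepts = [
    {"concept_id": "9202", "concept_name": "Outpatient Visit"},
    {"concept_id": "9201", "concept_name": "Inpatient Visit"},
    {"concept_id": "9203", "concept_name": "Emergency Room Visit"},
]
np.save(model_folder / PRETRAINED_EMBEDDING_VECTOR_FILE_NAME, vectors)
with open(model_folder / PRETRAINED_EMBEDDING_CONCEPT_FILE_NAME, "wb") as f:
    pickle.dump(concepts, f)

embeddings = PretrainedEmbeddings(str(model_folder))
assert embeddings.exists and embeddings.vocab_size == 3
if embeddings.is_concept_available("9202"):
    vector = embeddings.get_concept_embeddings("9202")  # shape: (128,)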
cehrgpt/models/special_tokens.py
@@ -0,0 +1,30 @@
+ # Special tokens for the cehr-gpt tokenizer
+ START_TOKEN = "[START]"
+ END_TOKEN = "[END]"
+ PAD_TOKEN = "[PAD]"
+ OUT_OF_VOCABULARY_TOKEN = "[OOV]"
+
+ # OMOP CONCEPT IDs
+ VISIT_CONCEPT_IDS = [
+     "9202",
+     "9203",
+     "581477",
+     "9201",
+     "5083",
+     "262",
+     "38004250",
+     "0",
+     "8883",
+     "38004238",
+     "38004251",
+     "38004222",
+     "38004268",
+     "38004228",
+     "32693",
+     "8971",
+     "38004269",
+     "38004193",
+     "32036",
+     "8782",
+ ]
+ DISCHARGE_CONCEPT_IDS = []
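These constants are plain module-level strings and lists, so downstream code can use them directly when filtering generated token sequences or configuring a tokenizer. The small sketch below is not part of the packaged code; the helper function and the non-visit concept id are hypothetical and only illustrate the intended role of the constants.

from typing import List

from cehrgpt.models.special_tokens import (
    END_TOKEN,
    PAD_TOKEN,
    START_TOKEN,
    VISIT_CONCEPT_IDS,
)

# Structural tokens that should never be interpreted as clinical events.
SPECIAL_TOKENS = {START_TOKEN, END_TOKEN, PAD_TOKEN}


def extract_visit_tokens(sequence: List[str]) -> List[str]:
    """Keep only the tokens that are OMOP visit concept ids (hypothetical helper)."""
    return [
        token
        for token in sequence
        if token not in SPECIAL_TOKENS and token in VISIT_CONCEPT_IDS
    ]


# "320128" stands in for an arbitrary non-visit concept id.
print(extract_visit_tokens(["[START]", "9202", "320128", "[END]", "[PAD]"]))
# ['9202']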