genhpf 1.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genhpf/__init__.py +9 -0
- genhpf/configs/__init__.py +23 -0
- genhpf/configs/config.yaml +8 -0
- genhpf/configs/configs.py +240 -0
- genhpf/configs/constants.py +29 -0
- genhpf/configs/initialize.py +58 -0
- genhpf/configs/utils.py +29 -0
- genhpf/criterions/__init__.py +74 -0
- genhpf/criterions/binary_cross_entropy.py +114 -0
- genhpf/criterions/binary_cross_entropy_with_logits.py +115 -0
- genhpf/criterions/criterion.py +87 -0
- genhpf/criterions/cross_entropy.py +202 -0
- genhpf/criterions/multi_task_criterion.py +177 -0
- genhpf/criterions/simclr_criterion.py +84 -0
- genhpf/criterions/wav2vec2_criterion.py +130 -0
- genhpf/datasets/__init__.py +84 -0
- genhpf/datasets/dataset.py +109 -0
- genhpf/datasets/genhpf_dataset.py +451 -0
- genhpf/datasets/meds_dataset.py +232 -0
- genhpf/loggings/__init__.py +0 -0
- genhpf/loggings/meters.py +374 -0
- genhpf/loggings/metrics.py +155 -0
- genhpf/loggings/progress_bar.py +445 -0
- genhpf/models/__init__.py +73 -0
- genhpf/models/genhpf.py +244 -0
- genhpf/models/genhpf_mlm.py +64 -0
- genhpf/models/genhpf_predictor.py +73 -0
- genhpf/models/genhpf_simclr.py +58 -0
- genhpf/models/genhpf_wav2vec2.py +304 -0
- genhpf/modules/__init__.py +15 -0
- genhpf/modules/gather_layer.py +23 -0
- genhpf/modules/grad_multiply.py +12 -0
- genhpf/modules/gumbel_vector_quantizer.py +204 -0
- genhpf/modules/identity_layer.py +8 -0
- genhpf/modules/layer_norm.py +27 -0
- genhpf/modules/positional_encoding.py +24 -0
- genhpf/scripts/__init__.py +0 -0
- genhpf/scripts/preprocess/__init__.py +0 -0
- genhpf/scripts/preprocess/genhpf/README.md +75 -0
- genhpf/scripts/preprocess/genhpf/__init__.py +0 -0
- genhpf/scripts/preprocess/genhpf/ehrs/__init__.py +36 -0
- genhpf/scripts/preprocess/genhpf/ehrs/ehr.py +919 -0
- genhpf/scripts/preprocess/genhpf/ehrs/eicu.py +550 -0
- genhpf/scripts/preprocess/genhpf/ehrs/mimiciii.py +839 -0
- genhpf/scripts/preprocess/genhpf/ehrs/mimiciv.py +619 -0
- genhpf/scripts/preprocess/genhpf/main.py +175 -0
- genhpf/scripts/preprocess/genhpf/manifest.py +79 -0
- genhpf/scripts/preprocess/genhpf/sample_dataset.py +177 -0
- genhpf/scripts/preprocess/genhpf/utils/__init__.py +3 -0
- genhpf/scripts/preprocess/genhpf/utils/utils.py +16 -0
- genhpf/scripts/preprocess/manifest.py +83 -0
- genhpf/scripts/preprocess/preprocess_meds.py +674 -0
- genhpf/scripts/test.py +264 -0
- genhpf/scripts/train.py +365 -0
- genhpf/trainer.py +370 -0
- genhpf/utils/checkpoint_utils.py +171 -0
- genhpf/utils/data_utils.py +130 -0
- genhpf/utils/distributed_utils.py +497 -0
- genhpf/utils/file_io.py +170 -0
- genhpf/utils/pdb.py +38 -0
- genhpf/utils/utils.py +204 -0
- genhpf-1.0.11.dist-info/LICENSE +21 -0
- genhpf-1.0.11.dist-info/METADATA +202 -0
- genhpf-1.0.11.dist-info/RECORD +67 -0
- genhpf-1.0.11.dist-info/WHEEL +5 -0
- genhpf-1.0.11.dist-info/entry_points.txt +6 -0
- genhpf-1.0.11.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# Integrated-EHR-Pipeline
|
|
2
|
+
- A project that refines the pre-processing code used in GenHPF
|
|
3
|
+
|
|
4
|
+
## Install Requirements
|
|
5
|
+
- NOTE: This repository requires `python>=3.9` and `Java>=8`
|
|
6
|
+
```
|
|
7
|
+
pip install numpy pandas tqdm treelib transformers pyspark
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
## How to Use
|
|
11
|
+
```
|
|
12
|
+
main.py --ehr {eicu, mimiciii, mimiciv}
|
|
13
|
+
```
|
|
14
|
+
- It automatically downloads the corresponding dataset from PhysioNet, but this requires appropriate certification.
|
|
15
|
+
- You can also use an already-downloaded dataset with the `--data {data path}` option.
|
|
16
|
+
- You can find a sample implementation of a PyTorch `dataset` in `sample_dataset.py`.
|
|
17
|
+
|
|
18
|
+
### Arguments Description
|
|
19
|
+
- `--dest`: Output directory path (e.g., `--dest $HOME/output/mimiciv/`)
|
|
20
|
+
- `--ehr`: Type of EHR dataset (eicu/mimiciii/mimiciv)
|
|
21
|
+
- `--data`: Path to the input data directory (e.g., `--data $HOME/data/mimiciv/2.2/`)
|
|
22
|
+
- `--first_icu`: Process only first ICU admission
|
|
23
|
+
- `--emb_type`: Embedding type ("textbase" or "codebase")
|
|
24
|
+
- `--feature`: Feature selection mode ("all_features" or "select")
|
|
25
|
+
|
|
26
|
+
### Prediction Tasks
|
|
27
|
+
You can add any combination of the following prediction tasks:
|
|
28
|
+
- `--readmission`
|
|
29
|
+
- `--mortality`
|
|
30
|
+
- `--los_3day`
|
|
31
|
+
- `--los_7day`
|
|
32
|
+
- `--long_term_mortality`
|
|
33
|
+
- `--final_acuity`
|
|
34
|
+
- `--imminent_discharge`
|
|
35
|
+
- `--diagnosis`
|
|
36
|
+
- `--creatinine`
|
|
37
|
+
- `--bilirubin`
|
|
38
|
+
- `--platelets`
|
|
39
|
+
- `--wbc`
|
|
40
|
+
|
|
41
|
+
### Baseline Model Requirements
|
|
42
|
+
|
|
43
|
+
To reproduce the experiments from the GenHPF paper (including other baseline models), use the following configurations for data preprocessing:
|
|
44
|
+
- **GenHPF**: `--emb_type textbase --feature all_features`
|
|
45
|
+
- **SAND**: `--emb_type codebase --feature select`
|
|
46
|
+
- **DescEmb**: `--emb_type textbase --feature select`
|
|
47
|
+
- **Rajkomar**: `--emb_type textbase --feature all_features`
|
|
48
|
+
|
|
49
|
+
Example Command for GenHPF model data preparation:
|
|
50
|
+
```shell script
|
|
51
|
+
python3 main.py \
|
|
52
|
+
--dest $HOME/output/mimiciv/ \
|
|
53
|
+
--ehr mimiciv \
|
|
54
|
+
--data $HOME/data/mimiciv/2.2/ \
|
|
55
|
+
--emb_type textbase \
|
|
56
|
+
--feature all_features \
|
|
57
|
+
--first_icu \
|
|
58
|
+
--mortality --readmission # add desired prediction tasks
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Cache Option
|
|
62
|
+
- `--cache`: Enable caching of intermediate processing results
|
|
63
|
+
- When enabled, the pipeline reuses previously processed data from cache instead of reprocessing
|
|
64
|
+
- Cache files are stored in `~/.cache/ehr` directory
|
|
65
|
+
- **Note**: When the cache is used, the task labels (mortality, readmission, etc.) are taken from the previously processed, cached results.
|
|
66
|
+
|
|
67
|
+
### Resource Requirements
|
|
68
|
+
- Full pipeline processing (all tables) for each dataset (MIMIC-III, MIMIC-IV, eICU) requires:
|
|
69
|
+
- ~180GB RAM
|
|
70
|
+
- ~6 hours on 128 cores (AMD EPYC 7502 32-Core Processor)
|
|
71
|
+
|
|
72
|
+
### External Resources
|
|
73
|
+
If automatic download fails, manually download these files and place them in the cache directory:
|
|
74
|
+
- CCS Diagnosis Codes: [ccs_multi_dx_tool_2015.csv](https://www.hcup-us.ahrq.gov/toolssoftware/ccs/Multi_Level_CCS_2015.zip)
|
|
75
|
+
- ICD Conversion: [icd10cmtoicd9gem.csv](https://data.nber.org/gem/icd10cmtoicd9gem.csv)
|
|
File without changes
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import importlib
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
from .ehr import EHR
|
|
5
|
+
|
|
6
|
+
# Registry filled in by ``register_ehr``: maps each registered name to the
# class that was decorated with it.
EHR_REGISTRY = {}

# ``__all__`` must be a sequence of names.  A bare string is iterated character
# by character by ``from ... import *``, which would try to export "E", "H"
# and "R" and fail, so the single public name is wrapped in a list.
__all__ = ["EHR"]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def register_ehr(name):
    """Return a class decorator that records the class in ``EHR_REGISTRY``.

    Args:
        name: Key under which the decorated class is stored.

    Returns:
        A decorator that stores the class it receives under ``name`` and
        returns that class unchanged.

    Raises:
        ValueError: Raised by the returned decorator, at decoration time, if
            ``name`` is already a key of ``EHR_REGISTRY``.
    """

    # Deliberately not called ``register_ehr``: reusing the outer name for the
    # inner function shadows it inside this scope and makes tracebacks
    # ambiguous about which of the two functions is involved.
    def register_ehr_cls(cls):
        if name in EHR_REGISTRY:
            raise ValueError(f"Cannot register duplicate EHR ({name})")
        EHR_REGISTRY[name] = cls
        return cls

    return register_ehr_cls
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def import_ehrs(ehrs_dir, namespace):
    """Import every public module or sub-directory found in ``ehrs_dir``.

    Entries whose names start with ``_`` or ``.`` are skipped, as is anything
    that is neither a ``.py`` file nor a directory.  Each remaining entry is
    imported as ``<namespace>.<entry name without .py>``.

    Args:
        ehrs_dir: Path of the directory to scan.
        namespace: Dotted package name used as the prefix of every imported
            module name.

    Raises:
        ImportError: Propagated from ``importlib.import_module`` when an entry
            cannot be imported under ``namespace``.
    """
    suffix = ".py"
    # os.listdir() yields entries in arbitrary order; sorting makes the import
    # order (and therefore any import-time side effects) reproducible.
    for entry in sorted(os.listdir(ehrs_dir)):
        if entry.startswith(("_", ".")):
            continue
        if entry.endswith(suffix):
            # Strip only the trailing extension instead of cutting at the
            # first ".py" that happens to occur inside the file name.
            module_name = entry[: -len(suffix)]
        elif os.path.isdir(os.path.join(ehrs_dir, entry)):
            module_name = entry
        else:
            continue
        importlib.import_module(namespace + "." + module_name)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Import every Python file (and sub-directory) that sits in the ehrs/ directory
# as soon as this package is loaded.
# NOTE(review): the namespace is hard-coded to "ehrs", so this presumably only
# resolves when the package is importable under that top-level name — confirm.
ehrs_dir = os.path.split(__file__)[0]
import_ehrs(ehrs_dir=ehrs_dir, namespace="ehrs")
|