genhpf 1.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. genhpf/__init__.py +9 -0
  2. genhpf/configs/__init__.py +23 -0
  3. genhpf/configs/config.yaml +8 -0
  4. genhpf/configs/configs.py +240 -0
  5. genhpf/configs/constants.py +29 -0
  6. genhpf/configs/initialize.py +58 -0
  7. genhpf/configs/utils.py +29 -0
  8. genhpf/criterions/__init__.py +74 -0
  9. genhpf/criterions/binary_cross_entropy.py +114 -0
  10. genhpf/criterions/binary_cross_entropy_with_logits.py +115 -0
  11. genhpf/criterions/criterion.py +87 -0
  12. genhpf/criterions/cross_entropy.py +202 -0
  13. genhpf/criterions/multi_task_criterion.py +177 -0
  14. genhpf/criterions/simclr_criterion.py +84 -0
  15. genhpf/criterions/wav2vec2_criterion.py +130 -0
  16. genhpf/datasets/__init__.py +84 -0
  17. genhpf/datasets/dataset.py +109 -0
  18. genhpf/datasets/genhpf_dataset.py +451 -0
  19. genhpf/datasets/meds_dataset.py +232 -0
  20. genhpf/loggings/__init__.py +0 -0
  21. genhpf/loggings/meters.py +374 -0
  22. genhpf/loggings/metrics.py +155 -0
  23. genhpf/loggings/progress_bar.py +445 -0
  24. genhpf/models/__init__.py +73 -0
  25. genhpf/models/genhpf.py +244 -0
  26. genhpf/models/genhpf_mlm.py +64 -0
  27. genhpf/models/genhpf_predictor.py +73 -0
  28. genhpf/models/genhpf_simclr.py +58 -0
  29. genhpf/models/genhpf_wav2vec2.py +304 -0
  30. genhpf/modules/__init__.py +15 -0
  31. genhpf/modules/gather_layer.py +23 -0
  32. genhpf/modules/grad_multiply.py +12 -0
  33. genhpf/modules/gumbel_vector_quantizer.py +204 -0
  34. genhpf/modules/identity_layer.py +8 -0
  35. genhpf/modules/layer_norm.py +27 -0
  36. genhpf/modules/positional_encoding.py +24 -0
  37. genhpf/scripts/__init__.py +0 -0
  38. genhpf/scripts/preprocess/__init__.py +0 -0
  39. genhpf/scripts/preprocess/genhpf/README.md +75 -0
  40. genhpf/scripts/preprocess/genhpf/__init__.py +0 -0
  41. genhpf/scripts/preprocess/genhpf/ehrs/__init__.py +36 -0
  42. genhpf/scripts/preprocess/genhpf/ehrs/ehr.py +919 -0
  43. genhpf/scripts/preprocess/genhpf/ehrs/eicu.py +550 -0
  44. genhpf/scripts/preprocess/genhpf/ehrs/mimiciii.py +839 -0
  45. genhpf/scripts/preprocess/genhpf/ehrs/mimiciv.py +619 -0
  46. genhpf/scripts/preprocess/genhpf/main.py +175 -0
  47. genhpf/scripts/preprocess/genhpf/manifest.py +79 -0
  48. genhpf/scripts/preprocess/genhpf/sample_dataset.py +177 -0
  49. genhpf/scripts/preprocess/genhpf/utils/__init__.py +3 -0
  50. genhpf/scripts/preprocess/genhpf/utils/utils.py +16 -0
  51. genhpf/scripts/preprocess/manifest.py +83 -0
  52. genhpf/scripts/preprocess/preprocess_meds.py +674 -0
  53. genhpf/scripts/test.py +264 -0
  54. genhpf/scripts/train.py +365 -0
  55. genhpf/trainer.py +370 -0
  56. genhpf/utils/checkpoint_utils.py +171 -0
  57. genhpf/utils/data_utils.py +130 -0
  58. genhpf/utils/distributed_utils.py +497 -0
  59. genhpf/utils/file_io.py +170 -0
  60. genhpf/utils/pdb.py +38 -0
  61. genhpf/utils/utils.py +204 -0
  62. genhpf-1.0.11.dist-info/LICENSE +21 -0
  63. genhpf-1.0.11.dist-info/METADATA +202 -0
  64. genhpf-1.0.11.dist-info/RECORD +67 -0
  65. genhpf-1.0.11.dist-info/WHEEL +5 -0
  66. genhpf-1.0.11.dist-info/entry_points.txt +6 -0
  67. genhpf-1.0.11.dist-info/top_level.txt +1 -0
@@ -0,0 +1,75 @@
1
+ # Integrated-EHR-Pipeline
2
+ - A project that refines the pre-processing code used in GenHPF
3
+
4
+ ## Install Requirements
5
+ - NOTE: This repository requires `python>=3.9` and `Java>=8`
6
+ ```
7
+ pip install numpy pandas tqdm treelib transformers pyspark
8
+ ```
9
+
10
+ ## How to Use
11
+ ```
12
+ main.py --ehr {eicu, mimiciii, mimiciv}
13
+ ```
14
+ - It automatically downloads the corresponding dataset from PhysioNet, but this requires the appropriate certification.
15
+ - You can also use an already-downloaded dataset with the `--data {data path}` option.
16
+ - You can find a sample implementation of a PyTorch `dataset` in `sample_dataset.py`.
17
+
18
+ ### Arguments Description
19
+ - `--dest`: Output directory path (e.g., `--dest $HOME/output/mimiciv/`)
20
+ - `--ehr`: Type of EHR dataset (eicu/mimiciii/mimiciv)
21
+ - `--data`: Path to the input data directory (e.g., `--data $HOME/data/mimiciv/2.2/`)
22
+ - `--first_icu`: Process only first ICU admission
23
+ - `--emb_type`: Embedding type ("textbase" or "codebase")
24
+ - `--feature`: Feature selection mode ("all_features" or "select")
25
+
26
+ ### Prediction Tasks
27
+ You can add any combination of the following prediction tasks:
28
+ - `--readmission`
29
+ - `--mortality`
30
+ - `--los_3day`
31
+ - `--los_7day`
32
+ - `--long_term_mortality`
33
+ - `--final_acuity`
34
+ - `--imminent_discharge`
35
+ - `--diagnosis`
36
+ - `--creatinine`
37
+ - `--bilirubin`
38
+ - `--platelets`
39
+ - `--wbc`
40
+
41
+ ### Baseline Model Requirements
42
+
43
+ To reproduce the experiments from the GenHPF paper (including other baseline models), use the following configurations for data preprocessing:
44
+ - **GenHPF**: `--emb_type textbase --feature all_features`
45
+ - **SAND**: `--emb_type codebase --feature select`
46
+ - **DescEmb**: `--emb_type textbase --feature select`
47
+ - **Rajkomar**: `--emb_type textbase --feature all_features`
48
+
49
+ Example Command for GenHPF model data preparation:
50
+ ```shell script
51
+ python3 main.py \
52
+ --dest $HOME/output/mimiciv/ \
53
+ --ehr mimiciv \
54
+ --data $HOME/data/mimiciv/2.2/ \
55
+ --emb_type textbase \
56
+ --feature all_features \
57
+ --first_icu \
58
+ --mortality --readmission # add desired prediction tasks
59
+ ```
60
+
61
+ ### Cache Option
62
+ - `--cache`: Enable caching of intermediate processing results
63
+ - When enabled, the pipeline reuses previously processed data from cache instead of reprocessing
64
+ - Cache files are stored in `~/.cache/ehr` directory
65
+ - **Note**: Cache will use the previously processed task labels (mortality, readmission, etc.) from the cached results.
66
+
67
+ ### Resource Requirements
68
+ - Full pipeline processing (all tables) for each dataset (MIMIC-III, MIMIC-IV, eICU) requires:
69
+ - ~180GB RAM
70
+ - ~6 hours on 128 cores (AMD EPYC 7502 32-Core Processor)
71
+
72
+ ### External Resources
73
+ If automatic download fails, manually download these files and place them in the cache directory:
74
+ - CCS Diagnosis Codes: [ccs_multi_dx_tool_2015.csv](https://www.hcup-us.ahrq.gov/toolssoftware/ccs/Multi_Level_CCS_2015.zip)
75
+ - ICD Conversion: [icd10cmtoicd9gem.csv](https://data.nber.org/gem/icd10cmtoicd9gem.csv)
File without changes
@@ -0,0 +1,36 @@
1
+ import importlib
2
+ import os
3
+
4
+ from .ehr import EHR
5
+
6
# Maps each registered EHR name to the class registered under that name.
EHR_REGISTRY = {}

# `__all__` must be a sequence of names. A bare string is iterated character
# by character by `from <package> import *`, which then fails while looking
# up the attributes "E", "H" and "R".
__all__ = ["EHR"]
9
+
10
+
11
def register_ehr(name):
    """Build a class decorator that files the decorated class under ``name``.

    The returned decorator stores the class in ``EHR_REGISTRY`` keyed by
    ``name`` and returns the class unchanged, so it can be applied as
    ``@register_ehr("some_name")``.

    Args:
        name: Key under which the decorated class is stored.

    Returns:
        A one-argument callable that registers and returns the class.

    Raises:
        ValueError: Raised by the returned decorator when ``name`` is already
            a key of ``EHR_REGISTRY``.
    """

    def _store_in_registry(ehr_cls):
        if name not in EHR_REGISTRY:
            EHR_REGISTRY[name] = ehr_cls
            return ehr_cls
        # Each name may be claimed only once.
        raise ValueError("Cannot register duplicate EHR ({})".format(name))

    return _store_in_registry
20
+
21
+
22
def import_ehrs(ehrs_dir, namespace):
    """Import every public module or sub-package found directly in ``ehrs_dir``.

    Entries whose name starts with ``_`` or ``.`` are skipped, as is anything
    that is neither a ``.py`` file nor a directory. Each remaining entry is
    imported as ``<namespace>.<entry name>``.

    Args:
        ehrs_dir: Directory whose entries are scanned (not recursively).
        namespace: Dotted package name that the entries are imported under.
    """
    # os.listdir() returns entries in arbitrary order; sort them so modules
    # are imported in the same order on every platform and run.
    for file in sorted(os.listdir(ehrs_dir)):
        if file.startswith(("_", ".")):
            continue
        if file.endswith(".py"):
            # Strip only the trailing extension. Searching for the first
            # ".py" substring would cut a name such as "a.pyx.py" too early.
            ehrs_name = os.path.splitext(file)[0]
        elif os.path.isdir(os.path.join(ehrs_dir, file)):
            ehrs_name = file
        else:
            continue
        importlib.import_module(namespace + "." + ehrs_name)
32
+
33
+
34
# Automatically import every Python file and sub-package in this directory.
# The namespace is this package's own import name rather than a hard-coded
# "ehrs", so the submodules resolve both when the directory is imported as a
# top-level `ehrs` package and when it is imported under its full dotted path.
ehrs_dir = os.path.dirname(__file__)
import_ehrs(ehrs_dir, __name__)