genhpf 1.0.3__tar.gz → 1.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of genhpf might be problematic. Click here for more details.

Files changed (85)
  1. {genhpf-1.0.3/src/genhpf.egg-info → genhpf-1.0.4}/PKG-INFO +2 -1
  2. {genhpf-1.0.3 → genhpf-1.0.4}/examples/test/genhpf/meds_hierarchical.yaml +1 -10
  3. {genhpf-1.0.3 → genhpf-1.0.4}/pyproject.toml +1 -0
  4. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/preprocess_meds.py +37 -8
  5. {genhpf-1.0.3 → genhpf-1.0.4/src/genhpf.egg-info}/PKG-INFO +2 -1
  6. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf.egg-info/SOURCES.txt +0 -1
  7. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf.egg-info/requires.txt +1 -0
  8. genhpf-1.0.3/requirements.txt +0 -48
  9. {genhpf-1.0.3 → genhpf-1.0.4}/.gitignore +0 -0
  10. {genhpf-1.0.3 → genhpf-1.0.4}/.pre-commit-config.yaml +0 -0
  11. {genhpf-1.0.3 → genhpf-1.0.4}/LICENSE +0 -0
  12. {genhpf-1.0.3 → genhpf-1.0.4}/README.md +0 -0
  13. {genhpf-1.0.3 → genhpf-1.0.4}/examples/pretrain/mlm/genhpf/flattened_pt.yaml +0 -0
  14. {genhpf-1.0.3 → genhpf-1.0.4}/examples/pretrain/simclr/genhpf/genhpf_hierarchical_pt.yaml +0 -0
  15. {genhpf-1.0.3 → genhpf-1.0.4}/examples/pretrain/wav2vec2/genhpf/hierarchical_pt.yaml +0 -0
  16. {genhpf-1.0.3 → genhpf-1.0.4}/examples/test/genhpf/genhpf_flattened.yaml +0 -0
  17. {genhpf-1.0.3 → genhpf-1.0.4}/examples/test/genhpf/genhpf_hierarchical.yaml +0 -0
  18. {genhpf-1.0.3 → genhpf-1.0.4}/examples/train/genhpf/genhpf_flattened_ft.yaml +0 -0
  19. {genhpf-1.0.3 → genhpf-1.0.4}/examples/train/genhpf/genhpf_hierarchical_ft.yaml +0 -0
  20. {genhpf-1.0.3 → genhpf-1.0.4}/examples/train/genhpf/genhpf_hierarchical_scr.yaml +0 -0
  21. {genhpf-1.0.3 → genhpf-1.0.4}/examples/train/genhpf/meds_hierarchical_scr.yaml +0 -0
  22. {genhpf-1.0.3 → genhpf-1.0.4}/setup.cfg +0 -0
  23. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/__init__.py +0 -0
  24. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/configs/__init__.py +0 -0
  25. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/configs/config.yaml +0 -0
  26. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/configs/configs.py +0 -0
  27. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/configs/constants.py +0 -0
  28. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/configs/initialize.py +0 -0
  29. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/configs/utils.py +0 -0
  30. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/criterions/__init__.py +0 -0
  31. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/criterions/binary_cross_entropy.py +0 -0
  32. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/criterions/binary_cross_entropy_with_logits.py +0 -0
  33. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/criterions/criterion.py +0 -0
  34. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/criterions/cross_entropy.py +0 -0
  35. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/criterions/multi_task_criterion.py +0 -0
  36. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/criterions/simclr_criterion.py +0 -0
  37. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/criterions/wav2vec2_criterion.py +0 -0
  38. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/datasets/__init__.py +0 -0
  39. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/datasets/dataset.py +0 -0
  40. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/datasets/genhpf_dataset.py +0 -0
  41. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/datasets/meds_dataset.py +0 -0
  42. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/loggings/__init__.py +0 -0
  43. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/loggings/meters.py +0 -0
  44. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/loggings/metrics.py +0 -0
  45. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/loggings/progress_bar.py +0 -0
  46. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/models/__init__.py +0 -0
  47. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/models/genhpf.py +0 -0
  48. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/models/genhpf_mlm.py +0 -0
  49. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/models/genhpf_predictor.py +0 -0
  50. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/models/genhpf_simclr.py +0 -0
  51. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/models/genhpf_wav2vec2.py +0 -0
  52. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/modules/__init__.py +0 -0
  53. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/modules/gather_layer.py +0 -0
  54. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/modules/grad_multiply.py +0 -0
  55. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/modules/gumbel_vector_quantizer.py +0 -0
  56. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/modules/identity_layer.py +0 -0
  57. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/modules/layer_norm.py +0 -0
  58. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/modules/positional_encoding.py +0 -0
  59. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/scripts/__init__.py +0 -0
  60. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/__init__.py +0 -0
  61. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/README.md +0 -0
  62. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/__init__.py +0 -0
  63. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/ehrs/__init__.py +0 -0
  64. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/ehrs/ehr.py +0 -0
  65. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/ehrs/eicu.py +0 -0
  66. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/ehrs/mimiciii.py +0 -0
  67. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/ehrs/mimiciv.py +0 -0
  68. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/main.py +0 -0
  69. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/manifest.py +0 -0
  70. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/sample_dataset.py +0 -0
  71. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/utils/__init__.py +0 -0
  72. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/utils/utils.py +0 -0
  73. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/manifest.py +0 -0
  74. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/scripts/test.py +0 -0
  75. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/scripts/train.py +0 -0
  76. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/trainer.py +0 -0
  77. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/utils/checkpoint_utils.py +0 -0
  78. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/utils/data_utils.py +0 -0
  79. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/utils/distributed_utils.py +0 -0
  80. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/utils/file_io.py +0 -0
  81. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/utils/pdb.py +0 -0
  82. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/utils/utils.py +0 -0
  83. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf.egg-info/dependency_links.txt +0 -0
  84. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf.egg-info/entry_points.txt +0 -0
  85. {genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: genhpf
3
- Version: 1.0.3
3
+ Version: 1.0.4
4
4
  Summary: GenHPF: General Healthcare Predictive Framework with Multi-task Multi-source Learning
5
5
  Author-email: Jungwoo Oh <ojw0123@kaist.ac.kr>, Kyunghoon Hur <pacesun@kaist.ac.kr>
6
6
  License: MIT license
@@ -18,6 +18,7 @@ Requires-Dist: h5pickle==0.4.2
18
18
  Requires-Dist: scikit-learn==1.6.1
19
19
  Requires-Dist: pandas==2.2.3
20
20
  Requires-Dist: polars==1.17.1
21
+ Requires-Dist: pyarrow==17.0.0
21
22
  Provides-Extra: dev
22
23
  Requires-Dist: pre-commit; extra == "dev"
23
24
  Requires-Dist: black; extra == "dev"
@@ -10,12 +10,7 @@ meds:
10
10
  output_dir: ???
11
11
 
12
12
  checkpoint:
13
- save_dir: checkpoints
14
- best_checkpoint_metric: auroc
15
- maximize_best_checkpoint_metric: true
16
- save_interval: 1
17
- keep_last_epochs: 5
18
- patience: 10
13
+ load_checkpoint: ???
19
14
 
20
15
  dataset:
21
16
  data_format: meds
@@ -44,10 +39,6 @@ criterion:
44
39
  num_labels:
45
40
  - 1
46
41
 
47
- optimization:
48
- max_epoch: 100
49
- lr: 1e-4
50
-
51
42
  model:
52
43
  _name: genhpf_predictor
53
44
 
@@ -29,6 +29,7 @@ dependencies = [
29
29
  "scikit-learn==1.6.1",
30
30
  "pandas==2.2.3",
31
31
  "polars==1.17.1",
32
+ "pyarrow==17.0.0",
32
33
  ]
33
34
 
34
35
  [tool.setuptools_scm]
@@ -1,10 +1,10 @@
1
1
  import functools
2
+ import logging
2
3
  import glob
3
4
  import multiprocessing
4
5
  import os
5
6
  import re
6
7
  import shutil
7
- import warnings
8
8
  from argparse import ArgumentParser
9
9
  from bisect import bisect_left, bisect_right
10
10
  from datetime import datetime
@@ -17,6 +17,9 @@ import polars as pl
17
17
  from tqdm import tqdm
18
18
  from transformers import AutoTokenizer
19
19
 
20
+ logger = logging.getLogger(__name__)
21
+ logger.setLevel(logging.INFO)
22
+
20
23
  pool_manager = multiprocessing.Manager()
21
24
  warned_codes = pool_manager.list()
22
25
 
@@ -82,6 +85,13 @@ def get_parser():
82
85
  action="store_true",
83
86
  help="whether or not to rebase the output directory if exists.",
84
87
  )
88
+ parser.add_argument(
89
+ "--debug",
90
+ type=bool,
91
+ default=False,
92
+ help="whether or not to enable the debug mode, which forces the script to be run with "
93
+ "only one worker."
94
+ )
85
95
  parser.add_argument(
86
96
  "--workers",
87
97
  metavar="N",
@@ -117,6 +127,16 @@ def main():
117
127
  mimic_dir = Path(args.mimic_dir) if args.mimic_dir is not None else None
118
128
 
119
129
  num_workers = max(args.workers, 1)
130
+ if args.debug:
131
+ num_workers = 1
132
+ else:
133
+ cpu_count = multiprocessing.cpu_count()
134
+ if num_workers > cpu_count:
135
+ logger.warning(
136
+ f"Number of workers (--workers) is greater than the number of available CPUs "
137
+ f"({cpu_count}). Setting the number of workers to {cpu_count}."
138
+ )
139
+ num_workers = cpu_count
120
140
 
121
141
  if root_path.is_dir():
122
142
  data_paths = glob.glob(str(root_path / "**/*.csv"), recursive=True)
@@ -132,7 +152,8 @@ def main():
132
152
  else:
133
153
  if args.rebase:
134
154
  shutil.rmtree(output_dir)
135
- if output_dir.exists():
155
+ output_dir.mkdir()
156
+ elif output_dir.exists():
136
157
  if args.skip_if_exists:
137
158
  ls = glob.glob(str(output_dir / "**/*"), recursive=True)
138
159
  expected_files = []
@@ -142,7 +163,7 @@ def main():
142
163
  for i in range(num_workers)
143
164
  ])
144
165
  if set(expected_files).issubset(set(ls)):
145
- print(
166
+ logger.info(
146
167
  f"Output directory already contains the expected files. Skipping the "
147
168
  "processing as --skip-if-exists is set. If you want to rebase the directory, "
148
169
  "please run the script with --rebase."
@@ -151,9 +172,8 @@ def main():
151
172
  else:
152
173
  raise ValueError(
153
174
  f"File exists: '{str(output_dir.resolve())}'. If you want to rebase the "
154
- "directory, please run the script with --rebase."
175
+ "directory automatically, please run the script with --rebase."
155
176
  )
156
- output_dir.mkdir()
157
177
 
158
178
  label_col_name = args.cohort_label_name
159
179
 
@@ -330,7 +350,7 @@ def main():
330
350
  )
331
351
 
332
352
  # meds --> remed
333
- print("Processing...")
353
+ logger.info(f"Start processing {data_path}")
334
354
  if num_workers <= 1:
335
355
  length_per_subject_gathered = [meds_to_remed_partial(data)]
336
356
  del data
@@ -342,6 +362,15 @@ def main():
342
362
  for subject_id_chunk in subject_id_chunks:
343
363
  data_chunks.append(data.filter(pl.col("subject_id").is_in(subject_id_chunk)))
344
364
  del data
365
+
366
+ num_valid_data_chunks = sum(map(lambda x: len(x) > 0, data_chunks))
367
+ if num_valid_data_chunks < num_workers:
368
+ raise ValueError(
369
+ "Number of valid data chunks (= number of unique subjects) were smaller "
370
+ "than the specified num workers (--workers) due to the small size of data. "
371
+ "Consider reducing the number of workers."
372
+ )
373
+
345
374
  pool = multiprocessing.get_context("spawn").Pool(processes=num_workers)
346
375
  # the order is preserved
347
376
  length_per_subject_gathered = pool.map(meds_to_remed_partial, data_chunks)
@@ -350,7 +379,7 @@ def main():
350
379
  del data_chunks
351
380
 
352
381
  if len(length_per_subject_gathered) != num_workers:
353
- print(
382
+ raise ValueError(
354
383
  "Number of processed workers were smaller than the specified num workers "
355
384
  "(--workers) due to the small size of data. Consider reducing the number of "
356
385
  "workers."
@@ -420,7 +449,7 @@ def meds_to_remed(
420
449
 
421
450
  if do_break and col_event not in warned_codes:
422
451
  warned_codes.append(col_event)
423
- warnings.warn(
452
+ logger.warning(
424
453
  "The dataset contains some codes that are not specified in "
425
454
  "the codes metadata, which may not be intended. Note that we "
426
455
  f"process this code as it is for now: {col_event}."
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: genhpf
3
- Version: 1.0.3
3
+ Version: 1.0.4
4
4
  Summary: GenHPF: General Healthcare Predictive Framework with Multi-task Multi-source Learning
5
5
  Author-email: Jungwoo Oh <ojw0123@kaist.ac.kr>, Kyunghoon Hur <pacesun@kaist.ac.kr>
6
6
  License: MIT license
@@ -18,6 +18,7 @@ Requires-Dist: h5pickle==0.4.2
18
18
  Requires-Dist: scikit-learn==1.6.1
19
19
  Requires-Dist: pandas==2.2.3
20
20
  Requires-Dist: polars==1.17.1
21
+ Requires-Dist: pyarrow==17.0.0
21
22
  Provides-Extra: dev
22
23
  Requires-Dist: pre-commit; extra == "dev"
23
24
  Requires-Dist: black; extra == "dev"
@@ -3,7 +3,6 @@
3
3
  LICENSE
4
4
  README.md
5
5
  pyproject.toml
6
- requirements.txt
7
6
  examples/pretrain/mlm/genhpf/flattened_pt.yaml
8
7
  examples/pretrain/simclr/genhpf/genhpf_hierarchical_pt.yaml
9
8
  examples/pretrain/wav2vec2/genhpf/hierarchical_pt.yaml
@@ -6,6 +6,7 @@ h5pickle==0.4.2
6
6
  scikit-learn==1.6.1
7
7
  pandas==2.2.3
8
8
  polars==1.17.1
9
+ pyarrow==17.0.0
9
10
 
10
11
  [dev]
11
12
  pre-commit
@@ -1,48 +0,0 @@
1
- appdirs==1.4.4
2
- axial-positional-embedding==0.2.1
3
- cachetools==5.3.1
4
- certifi==2023.5.7
5
- charset-normalizer==3.1.0
6
- click==8.1.3
7
- docker-pycreds==0.4.0
8
- einops==0.6.1
9
- filelock==3.12.0
10
- fsspec==2023.5.0
11
- gitdb==4.0.10
12
- GitPython==3.1.31
13
- h5pickle==0.4.2
14
- h5py==3.8.0
15
- huggingface-hub==0.14.1
16
- idna==3.4
17
- joblib==1.2.0
18
- local-attention==1.8.6
19
- numpy==1.24.3
20
- packaging==23.1
21
- pandas==2.0.2
22
- pathtools==0.1.2
23
- performer-pytorch==1.1.4
24
- Pillow==9.5.0
25
- protobuf==4.23.2
26
- psutil==5.9.5
27
- python-dateutil==2.8.2
28
- pytz==2023.3
29
- PyYAML==6.0
30
- regex==2023.5.5
31
- requests==2.31.0
32
- scikit-learn==1.2.2
33
- scipy==1.10.1
34
- sentry-sdk==1.24.0
35
- setproctitle==1.3.2
36
- six==1.16.0
37
- smmap==5.0.0
38
- threadpoolctl==3.1.0
39
- tokenizers==0.13.3
40
- tqdm==4.65.0
41
- transformers==4.29.2
42
- typing_extensions==4.6.2
43
- tzdata==2023.3
44
- urllib3==1.26.16
45
- wandb==0.15.3
46
- pyspark==3.5.0
47
- pyarrow==14.0.1
48
- treelib==1.7.0
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes