genhpf 1.0.2__tar.gz → 1.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This may be a problematic release.


This version of genhpf might be problematic. See the registry's advisory page for more details.

Files changed (85)
  1. {genhpf-1.0.2/src/genhpf.egg-info → genhpf-1.0.4}/PKG-INFO +2 -1
  2. {genhpf-1.0.2 → genhpf-1.0.4}/examples/test/genhpf/meds_hierarchical.yaml +1 -10
  3. {genhpf-1.0.2 → genhpf-1.0.4}/pyproject.toml +1 -0
  4. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/main.py +5 -4
  5. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/preprocess_meds.py +76 -22
  6. {genhpf-1.0.2 → genhpf-1.0.4/src/genhpf.egg-info}/PKG-INFO +2 -1
  7. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf.egg-info/SOURCES.txt +0 -1
  8. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf.egg-info/requires.txt +1 -0
  9. genhpf-1.0.2/requirements.txt +0 -48
  10. {genhpf-1.0.2 → genhpf-1.0.4}/.gitignore +0 -0
  11. {genhpf-1.0.2 → genhpf-1.0.4}/.pre-commit-config.yaml +0 -0
  12. {genhpf-1.0.2 → genhpf-1.0.4}/LICENSE +0 -0
  13. {genhpf-1.0.2 → genhpf-1.0.4}/README.md +0 -0
  14. {genhpf-1.0.2 → genhpf-1.0.4}/examples/pretrain/mlm/genhpf/flattened_pt.yaml +0 -0
  15. {genhpf-1.0.2 → genhpf-1.0.4}/examples/pretrain/simclr/genhpf/genhpf_hierarchical_pt.yaml +0 -0
  16. {genhpf-1.0.2 → genhpf-1.0.4}/examples/pretrain/wav2vec2/genhpf/hierarchical_pt.yaml +0 -0
  17. {genhpf-1.0.2 → genhpf-1.0.4}/examples/test/genhpf/genhpf_flattened.yaml +0 -0
  18. {genhpf-1.0.2 → genhpf-1.0.4}/examples/test/genhpf/genhpf_hierarchical.yaml +0 -0
  19. {genhpf-1.0.2 → genhpf-1.0.4}/examples/train/genhpf/genhpf_flattened_ft.yaml +0 -0
  20. {genhpf-1.0.2 → genhpf-1.0.4}/examples/train/genhpf/genhpf_hierarchical_ft.yaml +0 -0
  21. {genhpf-1.0.2 → genhpf-1.0.4}/examples/train/genhpf/genhpf_hierarchical_scr.yaml +0 -0
  22. {genhpf-1.0.2 → genhpf-1.0.4}/examples/train/genhpf/meds_hierarchical_scr.yaml +0 -0
  23. {genhpf-1.0.2 → genhpf-1.0.4}/setup.cfg +0 -0
  24. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/__init__.py +0 -0
  25. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/configs/__init__.py +0 -0
  26. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/configs/config.yaml +0 -0
  27. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/configs/configs.py +0 -0
  28. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/configs/constants.py +0 -0
  29. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/configs/initialize.py +0 -0
  30. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/configs/utils.py +0 -0
  31. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/criterions/__init__.py +0 -0
  32. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/criterions/binary_cross_entropy.py +0 -0
  33. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/criterions/binary_cross_entropy_with_logits.py +0 -0
  34. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/criterions/criterion.py +0 -0
  35. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/criterions/cross_entropy.py +0 -0
  36. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/criterions/multi_task_criterion.py +0 -0
  37. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/criterions/simclr_criterion.py +0 -0
  38. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/criterions/wav2vec2_criterion.py +0 -0
  39. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/datasets/__init__.py +0 -0
  40. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/datasets/dataset.py +0 -0
  41. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/datasets/genhpf_dataset.py +0 -0
  42. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/datasets/meds_dataset.py +0 -0
  43. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/loggings/__init__.py +0 -0
  44. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/loggings/meters.py +0 -0
  45. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/loggings/metrics.py +0 -0
  46. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/loggings/progress_bar.py +0 -0
  47. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/models/__init__.py +0 -0
  48. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/models/genhpf.py +0 -0
  49. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/models/genhpf_mlm.py +0 -0
  50. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/models/genhpf_predictor.py +0 -0
  51. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/models/genhpf_simclr.py +0 -0
  52. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/models/genhpf_wav2vec2.py +0 -0
  53. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/modules/__init__.py +0 -0
  54. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/modules/gather_layer.py +0 -0
  55. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/modules/grad_multiply.py +0 -0
  56. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/modules/gumbel_vector_quantizer.py +0 -0
  57. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/modules/identity_layer.py +0 -0
  58. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/modules/layer_norm.py +0 -0
  59. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/modules/positional_encoding.py +0 -0
  60. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/scripts/__init__.py +0 -0
  61. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/__init__.py +0 -0
  62. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/README.md +0 -0
  63. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/__init__.py +0 -0
  64. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/ehrs/__init__.py +0 -0
  65. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/ehrs/ehr.py +0 -0
  66. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/ehrs/eicu.py +0 -0
  67. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/ehrs/mimiciii.py +0 -0
  68. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/ehrs/mimiciv.py +0 -0
  69. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/manifest.py +0 -0
  70. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/sample_dataset.py +0 -0
  71. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/utils/__init__.py +0 -0
  72. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/genhpf/utils/utils.py +0 -0
  73. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/manifest.py +0 -0
  74. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/scripts/test.py +0 -0
  75. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/scripts/train.py +0 -0
  76. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/trainer.py +0 -0
  77. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/utils/checkpoint_utils.py +0 -0
  78. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/utils/data_utils.py +0 -0
  79. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/utils/distributed_utils.py +0 -0
  80. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/utils/file_io.py +0 -0
  81. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/utils/pdb.py +0 -0
  82. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf/utils/utils.py +0 -0
  83. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf.egg-info/dependency_links.txt +0 -0
  84. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf.egg-info/entry_points.txt +0 -0
  85. {genhpf-1.0.2 → genhpf-1.0.4}/src/genhpf.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: genhpf
3
- Version: 1.0.2
3
+ Version: 1.0.4
4
4
  Summary: GenHPF: General Healthcare Predictive Framework with Multi-task Multi-source Learning
5
5
  Author-email: Jungwoo Oh <ojw0123@kaist.ac.kr>, Kyunghoon Hur <pacesun@kaist.ac.kr>
6
6
  License: MIT license
@@ -18,6 +18,7 @@ Requires-Dist: h5pickle==0.4.2
18
18
  Requires-Dist: scikit-learn==1.6.1
19
19
  Requires-Dist: pandas==2.2.3
20
20
  Requires-Dist: polars==1.17.1
21
+ Requires-Dist: pyarrow==17.0.0
21
22
  Provides-Extra: dev
22
23
  Requires-Dist: pre-commit; extra == "dev"
23
24
  Requires-Dist: black; extra == "dev"
@@ -10,12 +10,7 @@ meds:
10
10
  output_dir: ???
11
11
 
12
12
  checkpoint:
13
- save_dir: checkpoints
14
- best_checkpoint_metric: auroc
15
- maximize_best_checkpoint_metric: true
16
- save_interval: 1
17
- keep_last_epochs: 5
18
- patience: 10
13
+ load_checkpoint: ???
19
14
 
20
15
  dataset:
21
16
  data_format: meds
@@ -44,10 +39,6 @@ criterion:
44
39
  num_labels:
45
40
  - 1
46
41
 
47
- optimization:
48
- max_epoch: 100
49
- lr: 1e-4
50
-
51
42
  model:
52
43
  _name: genhpf_predictor
53
44
 
@@ -29,6 +29,7 @@ dependencies = [
29
29
  "scikit-learn==1.6.1",
30
30
  "pandas==2.2.3",
31
31
  "polars==1.17.1",
32
+ "pyarrow==17.0.0",
32
33
  ]
33
34
 
34
35
  [tool.setuptools_scm]
@@ -151,7 +151,10 @@ def get_parser():
151
151
  return parser
152
152
 
153
153
 
154
- def main(args):
154
+ def main():
155
+ parser = get_parser()
156
+ args = parser.parse_args()
157
+
155
158
  if not os.path.exists(args.dest):
156
159
  os.makedirs(args.dest)
157
160
 
@@ -169,6 +172,4 @@ def main(args):
169
172
 
170
173
 
171
174
  if __name__ == "__main__":
172
- parser = get_parser()
173
- args = parser.parse_args()
174
- main(args)
175
+ main()
@@ -1,10 +1,10 @@
1
1
  import functools
2
+ import logging
2
3
  import glob
3
4
  import multiprocessing
4
5
  import os
5
6
  import re
6
7
  import shutil
7
- import warnings
8
8
  from argparse import ArgumentParser
9
9
  from bisect import bisect_left, bisect_right
10
10
  from datetime import datetime
@@ -17,6 +17,9 @@ import polars as pl
17
17
  from tqdm import tqdm
18
18
  from transformers import AutoTokenizer
19
19
 
20
+ logger = logging.getLogger(__name__)
21
+ logger.setLevel(logging.INFO)
22
+
20
23
  pool_manager = multiprocessing.Manager()
21
24
  warned_codes = pool_manager.list()
22
25
 
@@ -71,11 +74,24 @@ def get_parser():
71
74
  default="outputs",
72
75
  help="directory to save processed outputs.",
73
76
  )
77
+ parser.add_argument(
78
+ "--skip-if-exists",
79
+ action="store_true",
80
+ help="whether or not to skip the processing if the output directory already "
81
+ "exists.",
82
+ )
74
83
  parser.add_argument(
75
84
  "--rebase",
76
85
  action="store_true",
77
86
  help="whether or not to rebase the output directory if exists.",
78
87
  )
88
+ parser.add_argument(
89
+ "--debug",
90
+ type=bool,
91
+ default=False,
92
+ help="whether or not to enable the debug mode, which forces the script to be run with "
93
+ "only one worker."
94
+ )
79
95
  parser.add_argument(
80
96
  "--workers",
81
97
  metavar="N",
@@ -101,23 +117,26 @@ def get_parser():
101
117
  return parser
102
118
 
103
119
 
104
- def main(args):
120
+ def main():
121
+ parser = get_parser()
122
+ args = parser.parse_args()
123
+
105
124
  root_path = Path(args.root)
106
125
  output_dir = Path(args.output_dir)
107
126
  metadata_dir = Path(args.metadata_dir)
108
127
  mimic_dir = Path(args.mimic_dir) if args.mimic_dir is not None else None
109
128
 
110
- if not output_dir.exists():
111
- output_dir.mkdir()
129
+ num_workers = max(args.workers, 1)
130
+ if args.debug:
131
+ num_workers = 1
112
132
  else:
113
- if args.rebase:
114
- shutil.rmtree(output_dir)
115
- if output_dir.exists():
116
- raise ValueError(
117
- f"File exists: '{str(output_dir.resolve())}'. If you want to rebase the "
118
- "directory, please run the script with --rebase."
133
+ cpu_count = multiprocessing.cpu_count()
134
+ if num_workers > cpu_count:
135
+ logger.warning(
136
+ f"Number of workers (--workers) is greater than the number of available CPUs "
137
+ f"({cpu_count}). Setting the number of workers to {cpu_count}."
119
138
  )
120
- output_dir.mkdir()
139
+ num_workers = cpu_count
121
140
 
122
141
  if root_path.is_dir():
123
142
  data_paths = glob.glob(str(root_path / "**/*.csv"), recursive=True)
@@ -128,6 +147,34 @@ def main(args):
128
147
  else:
129
148
  data_paths = [root_path]
130
149
 
150
+ if not output_dir.exists():
151
+ output_dir.mkdir()
152
+ else:
153
+ if args.rebase:
154
+ shutil.rmtree(output_dir)
155
+ output_dir.mkdir()
156
+ elif output_dir.exists():
157
+ if args.skip_if_exists:
158
+ ls = glob.glob(str(output_dir / "**/*"), recursive=True)
159
+ expected_files = []
160
+ for subset in set(os.path.dirname(x) for x in data_paths):
161
+ expected_files.extend([
162
+ os.path.join(str(output_dir), os.path.basename(subset), f"{i}.h5")
163
+ for i in range(num_workers)
164
+ ])
165
+ if set(expected_files).issubset(set(ls)):
166
+ logger.info(
167
+ f"Output directory already contains the expected files. Skipping the "
168
+ "processing as --skip-if-exists is set. If you want to rebase the directory, "
169
+ "please run the script with --rebase."
170
+ )
171
+ return
172
+ else:
173
+ raise ValueError(
174
+ f"File exists: '{str(output_dir.resolve())}'. If you want to rebase the "
175
+ "directory automatically, please run the script with --rebase."
176
+ )
177
+
131
178
  label_col_name = args.cohort_label_name
132
179
 
133
180
  tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
@@ -295,7 +342,7 @@ def main(args):
295
342
  codes_metadata,
296
343
  output_dir,
297
344
  output_name,
298
- args.workers,
345
+ num_workers,
299
346
  d_items,
300
347
  d_labitems,
301
348
  warned_codes,
@@ -303,27 +350,36 @@ def main(args):
303
350
  )
304
351
 
305
352
  # meds --> remed
306
- print("Processing...")
307
- if args.workers <= 1:
353
+ logger.info(f"Start processing {data_path}")
354
+ if num_workers <= 1:
308
355
  length_per_subject_gathered = [meds_to_remed_partial(data)]
309
356
  del data
310
357
  else:
311
358
  subject_ids = data["subject_id"].unique().to_list()
312
- n = args.workers
359
+ n = num_workers
313
360
  subject_id_chunks = [subject_ids[i::n] for i in range(n)]
314
361
  data_chunks = []
315
362
  for subject_id_chunk in subject_id_chunks:
316
363
  data_chunks.append(data.filter(pl.col("subject_id").is_in(subject_id_chunk)))
317
364
  del data
318
- pool = multiprocessing.get_context("spawn").Pool(processes=args.workers)
365
+
366
+ num_valid_data_chunks = sum(map(lambda x: len(x) > 0, data_chunks))
367
+ if num_valid_data_chunks < num_workers:
368
+ raise ValueError(
369
+ "Number of valid data chunks (= number of unique subjects) were smaller "
370
+ "than the specified num workers (--workers) due to the small size of data. "
371
+ "Consider reducing the number of workers."
372
+ )
373
+
374
+ pool = multiprocessing.get_context("spawn").Pool(processes=num_workers)
319
375
  # the order is preserved
320
376
  length_per_subject_gathered = pool.map(meds_to_remed_partial, data_chunks)
321
377
  pool.close()
322
378
  pool.join()
323
379
  del data_chunks
324
380
 
325
- if len(length_per_subject_gathered) != args.workers:
326
- print(
381
+ if len(length_per_subject_gathered) != num_workers:
382
+ raise ValueError(
327
383
  "Number of processed workers were smaller than the specified num workers "
328
384
  "(--workers) due to the small size of data. Consider reducing the number of "
329
385
  "workers."
@@ -393,7 +449,7 @@ def meds_to_remed(
393
449
 
394
450
  if do_break and col_event not in warned_codes:
395
451
  warned_codes.append(col_event)
396
- warnings.warn(
452
+ logger.warning(
397
453
  "The dataset contains some codes that are not specified in "
398
454
  "the codes metadata, which may not be intended. Note that we "
399
455
  f"process this code as it is for now: {col_event}."
@@ -579,6 +635,4 @@ def meds_to_remed(
579
635
 
580
636
 
581
637
  if __name__ == "__main__":
582
- parser = get_parser()
583
- args = parser.parse_args()
584
- main(args)
638
+ main()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: genhpf
3
- Version: 1.0.2
3
+ Version: 1.0.4
4
4
  Summary: GenHPF: General Healthcare Predictive Framework with Multi-task Multi-source Learning
5
5
  Author-email: Jungwoo Oh <ojw0123@kaist.ac.kr>, Kyunghoon Hur <pacesun@kaist.ac.kr>
6
6
  License: MIT license
@@ -18,6 +18,7 @@ Requires-Dist: h5pickle==0.4.2
18
18
  Requires-Dist: scikit-learn==1.6.1
19
19
  Requires-Dist: pandas==2.2.3
20
20
  Requires-Dist: polars==1.17.1
21
+ Requires-Dist: pyarrow==17.0.0
21
22
  Provides-Extra: dev
22
23
  Requires-Dist: pre-commit; extra == "dev"
23
24
  Requires-Dist: black; extra == "dev"
@@ -3,7 +3,6 @@
3
3
  LICENSE
4
4
  README.md
5
5
  pyproject.toml
6
- requirements.txt
7
6
  examples/pretrain/mlm/genhpf/flattened_pt.yaml
8
7
  examples/pretrain/simclr/genhpf/genhpf_hierarchical_pt.yaml
9
8
  examples/pretrain/wav2vec2/genhpf/hierarchical_pt.yaml
@@ -6,6 +6,7 @@ h5pickle==0.4.2
6
6
  scikit-learn==1.6.1
7
7
  pandas==2.2.3
8
8
  polars==1.17.1
9
+ pyarrow==17.0.0
9
10
 
10
11
  [dev]
11
12
  pre-commit
@@ -1,48 +0,0 @@
1
- appdirs==1.4.4
2
- axial-positional-embedding==0.2.1
3
- cachetools==5.3.1
4
- certifi==2023.5.7
5
- charset-normalizer==3.1.0
6
- click==8.1.3
7
- docker-pycreds==0.4.0
8
- einops==0.6.1
9
- filelock==3.12.0
10
- fsspec==2023.5.0
11
- gitdb==4.0.10
12
- GitPython==3.1.31
13
- h5pickle==0.4.2
14
- h5py==3.8.0
15
- huggingface-hub==0.14.1
16
- idna==3.4
17
- joblib==1.2.0
18
- local-attention==1.8.6
19
- numpy==1.24.3
20
- packaging==23.1
21
- pandas==2.0.2
22
- pathtools==0.1.2
23
- performer-pytorch==1.1.4
24
- Pillow==9.5.0
25
- protobuf==4.23.2
26
- psutil==5.9.5
27
- python-dateutil==2.8.2
28
- pytz==2023.3
29
- PyYAML==6.0
30
- regex==2023.5.5
31
- requests==2.31.0
32
- scikit-learn==1.2.2
33
- scipy==1.10.1
34
- sentry-sdk==1.24.0
35
- setproctitle==1.3.2
36
- six==1.16.0
37
- smmap==5.0.0
38
- threadpoolctl==3.1.0
39
- tokenizers==0.13.3
40
- tqdm==4.65.0
41
- transformers==4.29.2
42
- typing_extensions==4.6.2
43
- tzdata==2023.3
44
- urllib3==1.26.16
45
- wandb==0.15.3
46
- pyspark==3.5.0
47
- pyarrow==14.0.1
48
- treelib==1.7.0
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes