genhpf-1.0.3-py3-none-any.whl → genhpf-1.0.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of genhpf might be problematic. Click here for more details.
- genhpf/scripts/preprocess/preprocess_meds.py +37 -8
- {genhpf-1.0.3.dist-info → genhpf-1.0.4.dist-info}/METADATA +2 -1
- {genhpf-1.0.3.dist-info → genhpf-1.0.4.dist-info}/RECORD +7 -7
- {genhpf-1.0.3.dist-info → genhpf-1.0.4.dist-info}/LICENSE +0 -0
- {genhpf-1.0.3.dist-info → genhpf-1.0.4.dist-info}/WHEEL +0 -0
- {genhpf-1.0.3.dist-info → genhpf-1.0.4.dist-info}/entry_points.txt +0 -0
- {genhpf-1.0.3.dist-info → genhpf-1.0.4.dist-info}/top_level.txt +0 -0
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import functools
|
|
2
|
+
import logging
|
|
2
3
|
import glob
|
|
3
4
|
import multiprocessing
|
|
4
5
|
import os
|
|
5
6
|
import re
|
|
6
7
|
import shutil
|
|
7
|
-
import warnings
|
|
8
8
|
from argparse import ArgumentParser
|
|
9
9
|
from bisect import bisect_left, bisect_right
|
|
10
10
|
from datetime import datetime
|
|
@@ -17,6 +17,9 @@ import polars as pl
|
|
|
17
17
|
from tqdm import tqdm
|
|
18
18
|
from transformers import AutoTokenizer
|
|
19
19
|
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
logger.setLevel(logging.INFO)
|
|
22
|
+
|
|
20
23
|
pool_manager = multiprocessing.Manager()
|
|
21
24
|
warned_codes = pool_manager.list()
|
|
22
25
|
|
|
@@ -82,6 +85,13 @@ def get_parser():
|
|
|
82
85
|
action="store_true",
|
|
83
86
|
help="whether or not to rebase the output directory if exists.",
|
|
84
87
|
)
|
|
88
|
+
parser.add_argument(
|
|
89
|
+
"--debug",
|
|
90
|
+
type=bool,
|
|
91
|
+
default=False,
|
|
92
|
+
help="whether or not to enable the debug mode, which forces the script to be run with "
|
|
93
|
+
"only one worker."
|
|
94
|
+
)
|
|
85
95
|
parser.add_argument(
|
|
86
96
|
"--workers",
|
|
87
97
|
metavar="N",
|
|
@@ -117,6 +127,16 @@ def main():
|
|
|
117
127
|
mimic_dir = Path(args.mimic_dir) if args.mimic_dir is not None else None
|
|
118
128
|
|
|
119
129
|
num_workers = max(args.workers, 1)
|
|
130
|
+
if args.debug:
|
|
131
|
+
num_workers = 1
|
|
132
|
+
else:
|
|
133
|
+
cpu_count = multiprocessing.cpu_count()
|
|
134
|
+
if num_workers > cpu_count:
|
|
135
|
+
logger.warning(
|
|
136
|
+
f"Number of workers (--workers) is greater than the number of available CPUs "
|
|
137
|
+
f"({cpu_count}). Setting the number of workers to {cpu_count}."
|
|
138
|
+
)
|
|
139
|
+
num_workers = cpu_count
|
|
120
140
|
|
|
121
141
|
if root_path.is_dir():
|
|
122
142
|
data_paths = glob.glob(str(root_path / "**/*.csv"), recursive=True)
|
|
@@ -132,7 +152,8 @@ def main():
|
|
|
132
152
|
else:
|
|
133
153
|
if args.rebase:
|
|
134
154
|
shutil.rmtree(output_dir)
|
|
135
|
-
|
|
155
|
+
output_dir.mkdir()
|
|
156
|
+
elif output_dir.exists():
|
|
136
157
|
if args.skip_if_exists:
|
|
137
158
|
ls = glob.glob(str(output_dir / "**/*"), recursive=True)
|
|
138
159
|
expected_files = []
|
|
@@ -142,7 +163,7 @@ def main():
|
|
|
142
163
|
for i in range(num_workers)
|
|
143
164
|
])
|
|
144
165
|
if set(expected_files).issubset(set(ls)):
|
|
145
|
-
|
|
166
|
+
logger.info(
|
|
146
167
|
f"Output directory already contains the expected files. Skipping the "
|
|
147
168
|
"processing as --skip-if-exists is set. If you want to rebase the directory, "
|
|
148
169
|
"please run the script with --rebase."
|
|
@@ -151,9 +172,8 @@ def main():
|
|
|
151
172
|
else:
|
|
152
173
|
raise ValueError(
|
|
153
174
|
f"File exists: '{str(output_dir.resolve())}'. If you want to rebase the "
|
|
154
|
-
"directory, please run the script with --rebase."
|
|
175
|
+
"directory automatically, please run the script with --rebase."
|
|
155
176
|
)
|
|
156
|
-
output_dir.mkdir()
|
|
157
177
|
|
|
158
178
|
label_col_name = args.cohort_label_name
|
|
159
179
|
|
|
@@ -330,7 +350,7 @@ def main():
|
|
|
330
350
|
)
|
|
331
351
|
|
|
332
352
|
# meds --> remed
|
|
333
|
-
|
|
353
|
+
logger.info(f"Start processing {data_path}")
|
|
334
354
|
if num_workers <= 1:
|
|
335
355
|
length_per_subject_gathered = [meds_to_remed_partial(data)]
|
|
336
356
|
del data
|
|
@@ -342,6 +362,15 @@ def main():
|
|
|
342
362
|
for subject_id_chunk in subject_id_chunks:
|
|
343
363
|
data_chunks.append(data.filter(pl.col("subject_id").is_in(subject_id_chunk)))
|
|
344
364
|
del data
|
|
365
|
+
|
|
366
|
+
num_valid_data_chunks = sum(map(lambda x: len(x) > 0, data_chunks))
|
|
367
|
+
if num_valid_data_chunks < num_workers:
|
|
368
|
+
raise ValueError(
|
|
369
|
+
"Number of valid data chunks (= number of unique subjects) were smaller "
|
|
370
|
+
"than the specified num workers (--workers) due to the small size of data. "
|
|
371
|
+
"Consider reducing the number of workers."
|
|
372
|
+
)
|
|
373
|
+
|
|
345
374
|
pool = multiprocessing.get_context("spawn").Pool(processes=num_workers)
|
|
346
375
|
# the order is preserved
|
|
347
376
|
length_per_subject_gathered = pool.map(meds_to_remed_partial, data_chunks)
|
|
@@ -350,7 +379,7 @@ def main():
|
|
|
350
379
|
del data_chunks
|
|
351
380
|
|
|
352
381
|
if len(length_per_subject_gathered) != num_workers:
|
|
353
|
-
|
|
382
|
+
raise ValueError(
|
|
354
383
|
"Number of processed workers were smaller than the specified num workers "
|
|
355
384
|
"(--workers) due to the small size of data. Consider reducing the number of "
|
|
356
385
|
"workers."
|
|
@@ -420,7 +449,7 @@ def meds_to_remed(
|
|
|
420
449
|
|
|
421
450
|
if do_break and col_event not in warned_codes:
|
|
422
451
|
warned_codes.append(col_event)
|
|
423
|
-
|
|
452
|
+
logger.warning(
|
|
424
453
|
"The dataset contains some codes that are not specified in "
|
|
425
454
|
"the codes metadata, which may not be intended. Note that we "
|
|
426
455
|
f"process this code as it is for now: {col_event}."
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: genhpf
|
|
3
|
-
Version: 1.0.3
|
|
3
|
+
Version: 1.0.4
|
|
4
4
|
Summary: GenHPF: General Healthcare Predictive Framework with Multi-task Multi-source Learning
|
|
5
5
|
Author-email: Jungwoo Oh <ojw0123@kaist.ac.kr>, Kyunghoon Hur <pacesun@kaist.ac.kr>
|
|
6
6
|
License: MIT license
|
|
@@ -18,6 +18,7 @@ Requires-Dist: h5pickle==0.4.2
|
|
|
18
18
|
Requires-Dist: scikit-learn==1.6.1
|
|
19
19
|
Requires-Dist: pandas==2.2.3
|
|
20
20
|
Requires-Dist: polars==1.17.1
|
|
21
|
+
Requires-Dist: pyarrow==17.0.0
|
|
21
22
|
Provides-Extra: dev
|
|
22
23
|
Requires-Dist: pre-commit; extra == "dev"
|
|
23
24
|
Requires-Dist: black; extra == "dev"
|
|
@@ -40,7 +40,7 @@ genhpf/scripts/test.py,sha256=wWi7OLqxsW9blj21m3RTirvziQ5UpkjkngOkgkE3Vb4,10149
|
|
|
40
40
|
genhpf/scripts/train.py,sha256=5f5PYOkiW7BahbFArvdOguAzUdDnY4Urw7Nx3aJ4kjs,12488
|
|
41
41
|
genhpf/scripts/preprocess/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
42
42
|
genhpf/scripts/preprocess/manifest.py,sha256=ZIK16e4vs_cS2K_tM1GaT38hc1nBHk6JB9Uga6OjgU4,2711
|
|
43
|
-
genhpf/scripts/preprocess/preprocess_meds.py,sha256=
|
|
43
|
+
genhpf/scripts/preprocess/preprocess_meds.py,sha256=x5R4KDzzB-21IUHjfkyo4-Be1t9U4oaurjm5VhxZ5Rw,25050
|
|
44
44
|
genhpf/scripts/preprocess/genhpf/README.md,sha256=qtpM_ABJk5yI8xbsUj1sZ71yX5bybx9ZvAymo0Lh5Vc,2877
|
|
45
45
|
genhpf/scripts/preprocess/genhpf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
46
46
|
genhpf/scripts/preprocess/genhpf/main.py,sha256=EF3sce0ltowMHIGK7zLEQEOnzOWQ_WJxoBowknHV3mQ,6161
|
|
@@ -59,9 +59,9 @@ genhpf/utils/distributed_utils.py,sha256=000xKlw8SLoSH16o6n2bB3eueGR0aVD_DufPYES
|
|
|
59
59
|
genhpf/utils/file_io.py,sha256=hnZXdMtAibfFDoIfn-SDusl-v7ZImeUEh0eD2MIxbG4,4919
|
|
60
60
|
genhpf/utils/pdb.py,sha256=400rk1pVfOpVpzKIFHnTRlZ2VCtBqRh9G-pRRwu2Oqo,930
|
|
61
61
|
genhpf/utils/utils.py,sha256=BoC_7Gz8uCHbUBCpcXGBMD-5irApi_6xM7nU-2ac4aA,6176
|
|
62
|
-
genhpf-1.0.3.dist-info/LICENSE,sha256=VK_rvhY2Xi_DAIZHtauni5O9-1_do5SNWjrskv4amg8,1065
|
|
63
|
-
genhpf-1.0.
|
|
64
|
-
genhpf-1.0.3.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
|
65
|
-
genhpf-1.0.3.dist-info/entry_points.txt,sha256=Wp94VV2w9KasBDLaluLM5EnjLgjNOAQVu44wKRDAwmQ,288
|
|
66
|
-
genhpf-1.0.3.dist-info/top_level.txt,sha256=lk846Vmnvydb6UZn8xmowj60nkrZYexNOGGnPM-IbhA,7
|
|
67
|
-
genhpf-1.0.3.dist-info/RECORD,,
|
|
62
|
+
genhpf-1.0.4.dist-info/LICENSE,sha256=VK_rvhY2Xi_DAIZHtauni5O9-1_do5SNWjrskv4amg8,1065
|
|
63
|
+
genhpf-1.0.4.dist-info/METADATA,sha256=Mgs4WysCKfBf4E2Jik2BgMdZPM8w1-rL5NoqgeqM5Zo,10589
|
|
64
|
+
genhpf-1.0.4.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
|
65
|
+
genhpf-1.0.4.dist-info/entry_points.txt,sha256=Wp94VV2w9KasBDLaluLM5EnjLgjNOAQVu44wKRDAwmQ,288
|
|
66
|
+
genhpf-1.0.4.dist-info/top_level.txt,sha256=lk846Vmnvydb6UZn8xmowj60nkrZYexNOGGnPM-IbhA,7
|
|
67
|
+
genhpf-1.0.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|