genhpf 1.0.2__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of genhpf might be problematic. Click here for more details.

@@ -151,7 +151,10 @@ def get_parser():
151
151
  return parser
152
152
 
153
153
 
154
- def main(args):
154
+ def main():
155
+ parser = get_parser()
156
+ args = parser.parse_args()
157
+
155
158
  if not os.path.exists(args.dest):
156
159
  os.makedirs(args.dest)
157
160
 
@@ -169,6 +172,4 @@ def main(args):
169
172
 
170
173
 
171
174
  if __name__ == "__main__":
172
- parser = get_parser()
173
- args = parser.parse_args()
174
- main(args)
175
+ main()
@@ -71,6 +71,12 @@ def get_parser():
71
71
  default="outputs",
72
72
  help="directory to save processed outputs.",
73
73
  )
74
+ parser.add_argument(
75
+ "--skip-if-exists",
76
+ action="store_true",
77
+ help="whether or not to skip the processing if the output directory already "
78
+ "exists.",
79
+ )
74
80
  parser.add_argument(
75
81
  "--rebase",
76
82
  action="store_true",
@@ -101,23 +107,16 @@ def get_parser():
101
107
  return parser
102
108
 
103
109
 
104
- def main(args):
110
+ def main():
111
+ parser = get_parser()
112
+ args = parser.parse_args()
113
+
105
114
  root_path = Path(args.root)
106
115
  output_dir = Path(args.output_dir)
107
116
  metadata_dir = Path(args.metadata_dir)
108
117
  mimic_dir = Path(args.mimic_dir) if args.mimic_dir is not None else None
109
118
 
110
- if not output_dir.exists():
111
- output_dir.mkdir()
112
- else:
113
- if args.rebase:
114
- shutil.rmtree(output_dir)
115
- if output_dir.exists():
116
- raise ValueError(
117
- f"File exists: '{str(output_dir.resolve())}'. If you want to rebase the "
118
- "directory, please run the script with --rebase."
119
- )
120
- output_dir.mkdir()
119
+ num_workers = max(args.workers, 1)
121
120
 
122
121
  if root_path.is_dir():
123
122
  data_paths = glob.glob(str(root_path / "**/*.csv"), recursive=True)
@@ -128,6 +127,34 @@ def main(args):
128
127
  else:
129
128
  data_paths = [root_path]
130
129
 
130
+ if not output_dir.exists():
131
+ output_dir.mkdir()
132
+ else:
133
+ if args.rebase:
134
+ shutil.rmtree(output_dir)
135
+ if output_dir.exists():
136
+ if args.skip_if_exists:
137
+ ls = glob.glob(str(output_dir / "**/*"), recursive=True)
138
+ expected_files = []
139
+ for subset in set(os.path.dirname(x) for x in data_paths):
140
+ expected_files.extend([
141
+ os.path.join(str(output_dir), os.path.basename(subset), f"{i}.h5")
142
+ for i in range(num_workers)
143
+ ])
144
+ if set(expected_files).issubset(set(ls)):
145
+ print(
146
+ f"Output directory already contains the expected files. Skipping the "
147
+ "processing as --skip-if-exists is set. If you want to rebase the directory, "
148
+ "please run the script with --rebase."
149
+ )
150
+ return
151
+ else:
152
+ raise ValueError(
153
+ f"File exists: '{str(output_dir.resolve())}'. If you want to rebase the "
154
+ "directory, please run the script with --rebase."
155
+ )
156
+ output_dir.mkdir()
157
+
131
158
  label_col_name = args.cohort_label_name
132
159
 
133
160
  tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
@@ -295,7 +322,7 @@ def main(args):
295
322
  codes_metadata,
296
323
  output_dir,
297
324
  output_name,
298
- args.workers,
325
+ num_workers,
299
326
  d_items,
300
327
  d_labitems,
301
328
  warned_codes,
@@ -304,25 +331,25 @@ def main(args):
304
331
 
305
332
  # meds --> remed
306
333
  print("Processing...")
307
- if args.workers <= 1:
334
+ if num_workers <= 1:
308
335
  length_per_subject_gathered = [meds_to_remed_partial(data)]
309
336
  del data
310
337
  else:
311
338
  subject_ids = data["subject_id"].unique().to_list()
312
- n = args.workers
339
+ n = num_workers
313
340
  subject_id_chunks = [subject_ids[i::n] for i in range(n)]
314
341
  data_chunks = []
315
342
  for subject_id_chunk in subject_id_chunks:
316
343
  data_chunks.append(data.filter(pl.col("subject_id").is_in(subject_id_chunk)))
317
344
  del data
318
- pool = multiprocessing.get_context("spawn").Pool(processes=args.workers)
345
+ pool = multiprocessing.get_context("spawn").Pool(processes=num_workers)
319
346
  # the order is preserved
320
347
  length_per_subject_gathered = pool.map(meds_to_remed_partial, data_chunks)
321
348
  pool.close()
322
349
  pool.join()
323
350
  del data_chunks
324
351
 
325
- if len(length_per_subject_gathered) != args.workers:
352
+ if len(length_per_subject_gathered) != num_workers:
326
353
  print(
327
354
  "Number of processed workers were smaller than the specified num workers "
328
355
  "(--workers) due to the small size of data. Consider reducing the number of "
@@ -579,6 +606,4 @@ def meds_to_remed(
579
606
 
580
607
 
581
608
  if __name__ == "__main__":
582
- parser = get_parser()
583
- args = parser.parse_args()
584
- main(args)
609
+ main()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: genhpf
3
- Version: 1.0.2
3
+ Version: 1.0.3
4
4
  Summary: GenHPF: General Healthcare Predictive Framework with Multi-task Multi-source Learning
5
5
  Author-email: Jungwoo Oh <ojw0123@kaist.ac.kr>, Kyunghoon Hur <pacesun@kaist.ac.kr>
6
6
  License: MIT license
@@ -40,10 +40,10 @@ genhpf/scripts/test.py,sha256=wWi7OLqxsW9blj21m3RTirvziQ5UpkjkngOkgkE3Vb4,10149
40
40
  genhpf/scripts/train.py,sha256=5f5PYOkiW7BahbFArvdOguAzUdDnY4Urw7Nx3aJ4kjs,12488
41
41
  genhpf/scripts/preprocess/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
42
42
  genhpf/scripts/preprocess/manifest.py,sha256=ZIK16e4vs_cS2K_tM1GaT38hc1nBHk6JB9Uga6OjgU4,2711
43
- genhpf/scripts/preprocess/preprocess_meds.py,sha256=CBdm9OHF3e3Ado7QSZLtPa3nIz2gnBCl1dGW2y_0aDg,22770
43
+ genhpf/scripts/preprocess/preprocess_meds.py,sha256=g0P5IUB-Oby9msIvuBv8MBb0ISNtmgqn_2ET5v0G9C4,23850
44
44
  genhpf/scripts/preprocess/genhpf/README.md,sha256=qtpM_ABJk5yI8xbsUj1sZ71yX5bybx9ZvAymo0Lh5Vc,2877
45
45
  genhpf/scripts/preprocess/genhpf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
46
- genhpf/scripts/preprocess/genhpf/main.py,sha256=2OnTAw6AGS5wsH_Lxv_-0uMyMOVGyfAmoFNqv7Z8GGY,6168
46
+ genhpf/scripts/preprocess/genhpf/main.py,sha256=EF3sce0ltowMHIGK7zLEQEOnzOWQ_WJxoBowknHV3mQ,6161
47
47
  genhpf/scripts/preprocess/genhpf/manifest.py,sha256=uHx0POSs9-ZB8Vtib7rPJ6hgDVJ1CBN6Ccfa4PpqmnM,2663
48
48
  genhpf/scripts/preprocess/genhpf/sample_dataset.py,sha256=JzjMY2ynIYoWWtRlBG9Hxv6EoF27jJHyd3VYfqsM0Xs,5569
49
49
  genhpf/scripts/preprocess/genhpf/ehrs/__init__.py,sha256=8bA4Pk0ylLIpwFQKEx6lis0k_inh4owF2SlHjHhKkeE,895
@@ -59,9 +59,9 @@ genhpf/utils/distributed_utils.py,sha256=000xKlw8SLoSH16o6n2bB3eueGR0aVD_DufPYES
59
59
  genhpf/utils/file_io.py,sha256=hnZXdMtAibfFDoIfn-SDusl-v7ZImeUEh0eD2MIxbG4,4919
60
60
  genhpf/utils/pdb.py,sha256=400rk1pVfOpVpzKIFHnTRlZ2VCtBqRh9G-pRRwu2Oqo,930
61
61
  genhpf/utils/utils.py,sha256=BoC_7Gz8uCHbUBCpcXGBMD-5irApi_6xM7nU-2ac4aA,6176
62
- genhpf-1.0.2.dist-info/LICENSE,sha256=VK_rvhY2Xi_DAIZHtauni5O9-1_do5SNWjrskv4amg8,1065
63
- genhpf-1.0.2.dist-info/METADATA,sha256=sAIt6zSLc78ngfOSonQgFO4o9IPE9ALEdRVFKLcPFsI,10558
64
- genhpf-1.0.2.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
65
- genhpf-1.0.2.dist-info/entry_points.txt,sha256=Wp94VV2w9KasBDLaluLM5EnjLgjNOAQVu44wKRDAwmQ,288
66
- genhpf-1.0.2.dist-info/top_level.txt,sha256=lk846Vmnvydb6UZn8xmowj60nkrZYexNOGGnPM-IbhA,7
67
- genhpf-1.0.2.dist-info/RECORD,,
62
+ genhpf-1.0.3.dist-info/LICENSE,sha256=VK_rvhY2Xi_DAIZHtauni5O9-1_do5SNWjrskv4amg8,1065
63
+ genhpf-1.0.3.dist-info/METADATA,sha256=6_P3-r3RlihHDK5vxM9ae39OO5eTQlVP2iAyMH-xgZY,10558
64
+ genhpf-1.0.3.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
65
+ genhpf-1.0.3.dist-info/entry_points.txt,sha256=Wp94VV2w9KasBDLaluLM5EnjLgjNOAQVu44wKRDAwmQ,288
66
+ genhpf-1.0.3.dist-info/top_level.txt,sha256=lk846Vmnvydb6UZn8xmowj60nkrZYexNOGGnPM-IbhA,7
67
+ genhpf-1.0.3.dist-info/RECORD,,
File without changes