genhpf 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of genhpf might be problematic. Click here for more details.
- genhpf/scripts/preprocess/genhpf/main.py +5 -4
- genhpf/scripts/preprocess/preprocess_meds.py +76 -22
- {genhpf-1.0.2.dist-info → genhpf-1.0.4.dist-info}/METADATA +2 -1
- {genhpf-1.0.2.dist-info → genhpf-1.0.4.dist-info}/RECORD +8 -8
- {genhpf-1.0.2.dist-info → genhpf-1.0.4.dist-info}/LICENSE +0 -0
- {genhpf-1.0.2.dist-info → genhpf-1.0.4.dist-info}/WHEEL +0 -0
- {genhpf-1.0.2.dist-info → genhpf-1.0.4.dist-info}/entry_points.txt +0 -0
- {genhpf-1.0.2.dist-info → genhpf-1.0.4.dist-info}/top_level.txt +0 -0
|
@@ -151,7 +151,10 @@ def get_parser():
|
|
|
151
151
|
return parser
|
|
152
152
|
|
|
153
153
|
|
|
154
|
-
def main(
|
|
154
|
+
def main():
|
|
155
|
+
parser = get_parser()
|
|
156
|
+
args = parser.parse_args()
|
|
157
|
+
|
|
155
158
|
if not os.path.exists(args.dest):
|
|
156
159
|
os.makedirs(args.dest)
|
|
157
160
|
|
|
@@ -169,6 +172,4 @@ def main(args):
|
|
|
169
172
|
|
|
170
173
|
|
|
171
174
|
if __name__ == "__main__":
|
|
172
|
-
|
|
173
|
-
args = parser.parse_args()
|
|
174
|
-
main(args)
|
|
175
|
+
main()
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import functools
|
|
2
|
+
import logging
|
|
2
3
|
import glob
|
|
3
4
|
import multiprocessing
|
|
4
5
|
import os
|
|
5
6
|
import re
|
|
6
7
|
import shutil
|
|
7
|
-
import warnings
|
|
8
8
|
from argparse import ArgumentParser
|
|
9
9
|
from bisect import bisect_left, bisect_right
|
|
10
10
|
from datetime import datetime
|
|
@@ -17,6 +17,9 @@ import polars as pl
|
|
|
17
17
|
from tqdm import tqdm
|
|
18
18
|
from transformers import AutoTokenizer
|
|
19
19
|
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
logger.setLevel(logging.INFO)
|
|
22
|
+
|
|
20
23
|
pool_manager = multiprocessing.Manager()
|
|
21
24
|
warned_codes = pool_manager.list()
|
|
22
25
|
|
|
@@ -71,11 +74,24 @@ def get_parser():
|
|
|
71
74
|
default="outputs",
|
|
72
75
|
help="directory to save processed outputs.",
|
|
73
76
|
)
|
|
77
|
+
parser.add_argument(
|
|
78
|
+
"--skip-if-exists",
|
|
79
|
+
action="store_true",
|
|
80
|
+
help="whether or not to skip the processing if the output directory already "
|
|
81
|
+
"exists.",
|
|
82
|
+
)
|
|
74
83
|
parser.add_argument(
|
|
75
84
|
"--rebase",
|
|
76
85
|
action="store_true",
|
|
77
86
|
help="whether or not to rebase the output directory if exists.",
|
|
78
87
|
)
|
|
88
|
+
parser.add_argument(
|
|
89
|
+
"--debug",
|
|
90
|
+
type=bool,
|
|
91
|
+
default=False,
|
|
92
|
+
help="whether or not to enable the debug mode, which forces the script to be run with "
|
|
93
|
+
"only one worker."
|
|
94
|
+
)
|
|
79
95
|
parser.add_argument(
|
|
80
96
|
"--workers",
|
|
81
97
|
metavar="N",
|
|
@@ -101,23 +117,26 @@ def get_parser():
|
|
|
101
117
|
return parser
|
|
102
118
|
|
|
103
119
|
|
|
104
|
-
def main(
|
|
120
|
+
def main():
|
|
121
|
+
parser = get_parser()
|
|
122
|
+
args = parser.parse_args()
|
|
123
|
+
|
|
105
124
|
root_path = Path(args.root)
|
|
106
125
|
output_dir = Path(args.output_dir)
|
|
107
126
|
metadata_dir = Path(args.metadata_dir)
|
|
108
127
|
mimic_dir = Path(args.mimic_dir) if args.mimic_dir is not None else None
|
|
109
128
|
|
|
110
|
-
|
|
111
|
-
|
|
129
|
+
num_workers = max(args.workers, 1)
|
|
130
|
+
if args.debug:
|
|
131
|
+
num_workers = 1
|
|
112
132
|
else:
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
f"
|
|
118
|
-
"directory, please run the script with --rebase."
|
|
133
|
+
cpu_count = multiprocessing.cpu_count()
|
|
134
|
+
if num_workers > cpu_count:
|
|
135
|
+
logger.warning(
|
|
136
|
+
f"Number of workers (--workers) is greater than the number of available CPUs "
|
|
137
|
+
f"({cpu_count}). Setting the number of workers to {cpu_count}."
|
|
119
138
|
)
|
|
120
|
-
|
|
139
|
+
num_workers = cpu_count
|
|
121
140
|
|
|
122
141
|
if root_path.is_dir():
|
|
123
142
|
data_paths = glob.glob(str(root_path / "**/*.csv"), recursive=True)
|
|
@@ -128,6 +147,34 @@ def main(args):
|
|
|
128
147
|
else:
|
|
129
148
|
data_paths = [root_path]
|
|
130
149
|
|
|
150
|
+
if not output_dir.exists():
|
|
151
|
+
output_dir.mkdir()
|
|
152
|
+
else:
|
|
153
|
+
if args.rebase:
|
|
154
|
+
shutil.rmtree(output_dir)
|
|
155
|
+
output_dir.mkdir()
|
|
156
|
+
elif output_dir.exists():
|
|
157
|
+
if args.skip_if_exists:
|
|
158
|
+
ls = glob.glob(str(output_dir / "**/*"), recursive=True)
|
|
159
|
+
expected_files = []
|
|
160
|
+
for subset in set(os.path.dirname(x) for x in data_paths):
|
|
161
|
+
expected_files.extend([
|
|
162
|
+
os.path.join(str(output_dir), os.path.basename(subset), f"{i}.h5")
|
|
163
|
+
for i in range(num_workers)
|
|
164
|
+
])
|
|
165
|
+
if set(expected_files).issubset(set(ls)):
|
|
166
|
+
logger.info(
|
|
167
|
+
f"Output directory already contains the expected files. Skipping the "
|
|
168
|
+
"processing as --skip-if-exists is set. If you want to rebase the directory, "
|
|
169
|
+
"please run the script with --rebase."
|
|
170
|
+
)
|
|
171
|
+
return
|
|
172
|
+
else:
|
|
173
|
+
raise ValueError(
|
|
174
|
+
f"File exists: '{str(output_dir.resolve())}'. If you want to rebase the "
|
|
175
|
+
"directory automatically, please run the script with --rebase."
|
|
176
|
+
)
|
|
177
|
+
|
|
131
178
|
label_col_name = args.cohort_label_name
|
|
132
179
|
|
|
133
180
|
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
|
|
@@ -295,7 +342,7 @@ def main(args):
|
|
|
295
342
|
codes_metadata,
|
|
296
343
|
output_dir,
|
|
297
344
|
output_name,
|
|
298
|
-
|
|
345
|
+
num_workers,
|
|
299
346
|
d_items,
|
|
300
347
|
d_labitems,
|
|
301
348
|
warned_codes,
|
|
@@ -303,27 +350,36 @@ def main(args):
|
|
|
303
350
|
)
|
|
304
351
|
|
|
305
352
|
# meds --> remed
|
|
306
|
-
|
|
307
|
-
if
|
|
353
|
+
logger.info(f"Start processing {data_path}")
|
|
354
|
+
if num_workers <= 1:
|
|
308
355
|
length_per_subject_gathered = [meds_to_remed_partial(data)]
|
|
309
356
|
del data
|
|
310
357
|
else:
|
|
311
358
|
subject_ids = data["subject_id"].unique().to_list()
|
|
312
|
-
n =
|
|
359
|
+
n = num_workers
|
|
313
360
|
subject_id_chunks = [subject_ids[i::n] for i in range(n)]
|
|
314
361
|
data_chunks = []
|
|
315
362
|
for subject_id_chunk in subject_id_chunks:
|
|
316
363
|
data_chunks.append(data.filter(pl.col("subject_id").is_in(subject_id_chunk)))
|
|
317
364
|
del data
|
|
318
|
-
|
|
365
|
+
|
|
366
|
+
num_valid_data_chunks = sum(map(lambda x: len(x) > 0, data_chunks))
|
|
367
|
+
if num_valid_data_chunks < num_workers:
|
|
368
|
+
raise ValueError(
|
|
369
|
+
"Number of valid data chunks (= number of unique subjects) were smaller "
|
|
370
|
+
"than the specified num workers (--workers) due to the small size of data. "
|
|
371
|
+
"Consider reducing the number of workers."
|
|
372
|
+
)
|
|
373
|
+
|
|
374
|
+
pool = multiprocessing.get_context("spawn").Pool(processes=num_workers)
|
|
319
375
|
# the order is preserved
|
|
320
376
|
length_per_subject_gathered = pool.map(meds_to_remed_partial, data_chunks)
|
|
321
377
|
pool.close()
|
|
322
378
|
pool.join()
|
|
323
379
|
del data_chunks
|
|
324
380
|
|
|
325
|
-
if len(length_per_subject_gathered) !=
|
|
326
|
-
|
|
381
|
+
if len(length_per_subject_gathered) != num_workers:
|
|
382
|
+
raise ValueError(
|
|
327
383
|
"Number of processed workers were smaller than the specified num workers "
|
|
328
384
|
"(--workers) due to the small size of data. Consider reducing the number of "
|
|
329
385
|
"workers."
|
|
@@ -393,7 +449,7 @@ def meds_to_remed(
|
|
|
393
449
|
|
|
394
450
|
if do_break and col_event not in warned_codes:
|
|
395
451
|
warned_codes.append(col_event)
|
|
396
|
-
|
|
452
|
+
logger.warning(
|
|
397
453
|
"The dataset contains some codes that are not specified in "
|
|
398
454
|
"the codes metadata, which may not be intended. Note that we "
|
|
399
455
|
f"process this code as it is for now: {col_event}."
|
|
@@ -579,6 +635,4 @@ def meds_to_remed(
|
|
|
579
635
|
|
|
580
636
|
|
|
581
637
|
if __name__ == "__main__":
|
|
582
|
-
|
|
583
|
-
args = parser.parse_args()
|
|
584
|
-
main(args)
|
|
638
|
+
main()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: genhpf
|
|
3
|
-
Version: 1.0.2
|
|
3
|
+
Version: 1.0.4
|
|
4
4
|
Summary: GenHPF: General Healthcare Predictive Framework with Multi-task Multi-source Learning
|
|
5
5
|
Author-email: Jungwoo Oh <ojw0123@kaist.ac.kr>, Kyunghoon Hur <pacesun@kaist.ac.kr>
|
|
6
6
|
License: MIT license
|
|
@@ -18,6 +18,7 @@ Requires-Dist: h5pickle==0.4.2
|
|
|
18
18
|
Requires-Dist: scikit-learn==1.6.1
|
|
19
19
|
Requires-Dist: pandas==2.2.3
|
|
20
20
|
Requires-Dist: polars==1.17.1
|
|
21
|
+
Requires-Dist: pyarrow==17.0.0
|
|
21
22
|
Provides-Extra: dev
|
|
22
23
|
Requires-Dist: pre-commit; extra == "dev"
|
|
23
24
|
Requires-Dist: black; extra == "dev"
|
|
@@ -40,10 +40,10 @@ genhpf/scripts/test.py,sha256=wWi7OLqxsW9blj21m3RTirvziQ5UpkjkngOkgkE3Vb4,10149
|
|
|
40
40
|
genhpf/scripts/train.py,sha256=5f5PYOkiW7BahbFArvdOguAzUdDnY4Urw7Nx3aJ4kjs,12488
|
|
41
41
|
genhpf/scripts/preprocess/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
42
42
|
genhpf/scripts/preprocess/manifest.py,sha256=ZIK16e4vs_cS2K_tM1GaT38hc1nBHk6JB9Uga6OjgU4,2711
|
|
43
|
-
genhpf/scripts/preprocess/preprocess_meds.py,sha256=
|
|
43
|
+
genhpf/scripts/preprocess/preprocess_meds.py,sha256=x5R4KDzzB-21IUHjfkyo4-Be1t9U4oaurjm5VhxZ5Rw,25050
|
|
44
44
|
genhpf/scripts/preprocess/genhpf/README.md,sha256=qtpM_ABJk5yI8xbsUj1sZ71yX5bybx9ZvAymo0Lh5Vc,2877
|
|
45
45
|
genhpf/scripts/preprocess/genhpf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
46
|
-
genhpf/scripts/preprocess/genhpf/main.py,sha256=
|
|
46
|
+
genhpf/scripts/preprocess/genhpf/main.py,sha256=EF3sce0ltowMHIGK7zLEQEOnzOWQ_WJxoBowknHV3mQ,6161
|
|
47
47
|
genhpf/scripts/preprocess/genhpf/manifest.py,sha256=uHx0POSs9-ZB8Vtib7rPJ6hgDVJ1CBN6Ccfa4PpqmnM,2663
|
|
48
48
|
genhpf/scripts/preprocess/genhpf/sample_dataset.py,sha256=JzjMY2ynIYoWWtRlBG9Hxv6EoF27jJHyd3VYfqsM0Xs,5569
|
|
49
49
|
genhpf/scripts/preprocess/genhpf/ehrs/__init__.py,sha256=8bA4Pk0ylLIpwFQKEx6lis0k_inh4owF2SlHjHhKkeE,895
|
|
@@ -59,9 +59,9 @@ genhpf/utils/distributed_utils.py,sha256=000xKlw8SLoSH16o6n2bB3eueGR0aVD_DufPYES
|
|
|
59
59
|
genhpf/utils/file_io.py,sha256=hnZXdMtAibfFDoIfn-SDusl-v7ZImeUEh0eD2MIxbG4,4919
|
|
60
60
|
genhpf/utils/pdb.py,sha256=400rk1pVfOpVpzKIFHnTRlZ2VCtBqRh9G-pRRwu2Oqo,930
|
|
61
61
|
genhpf/utils/utils.py,sha256=BoC_7Gz8uCHbUBCpcXGBMD-5irApi_6xM7nU-2ac4aA,6176
|
|
62
|
-
genhpf-1.0.
|
|
63
|
-
genhpf-1.0.
|
|
64
|
-
genhpf-1.0.
|
|
65
|
-
genhpf-1.0.
|
|
66
|
-
genhpf-1.0.
|
|
67
|
-
genhpf-1.0.
|
|
62
|
+
genhpf-1.0.4.dist-info/LICENSE,sha256=VK_rvhY2Xi_DAIZHtauni5O9-1_do5SNWjrskv4amg8,1065
|
|
63
|
+
genhpf-1.0.4.dist-info/METADATA,sha256=Mgs4WysCKfBf4E2Jik2BgMdZPM8w1-rL5NoqgeqM5Zo,10589
|
|
64
|
+
genhpf-1.0.4.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
|
65
|
+
genhpf-1.0.4.dist-info/entry_points.txt,sha256=Wp94VV2w9KasBDLaluLM5EnjLgjNOAQVu44wKRDAwmQ,288
|
|
66
|
+
genhpf-1.0.4.dist-info/top_level.txt,sha256=lk846Vmnvydb6UZn8xmowj60nkrZYexNOGGnPM-IbhA,7
|
|
67
|
+
genhpf-1.0.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|