PyPI - genhpf - Versions diffs - 1.0.3__tar.gz → 1.0.4__tar.gz - Mend

genhpf 1.0.3.tar.gz → 1.0.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of genhpf might be problematic. Click here for more details.

Files changed (85) hide show

{genhpf-1.0.3/src/genhpf.egg-info → genhpf-1.0.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: genhpf
-Version: 1.0.3
+Version: 1.0.4
 Summary: GenHPF: General Healthcare Predictive Framework with Multi-task Multi-source Learning
 Author-email: Jungwoo Oh <ojw0123@kaist.ac.kr>, Kyunghoon Hur <pacesun@kaist.ac.kr>
 License: MIT license
@@ -18,6 +18,7 @@ Requires-Dist: h5pickle==0.4.2
 Requires-Dist: scikit-learn==1.6.1
 Requires-Dist: pandas==2.2.3
 Requires-Dist: polars==1.17.1
+Requires-Dist: pyarrow==17.0.0
 Provides-Extra: dev
 Requires-Dist: pre-commit; extra == "dev"
 Requires-Dist: black; extra == "dev"

{genhpf-1.0.3 → genhpf-1.0.4}/examples/test/genhpf/meds_hierarchical.yaml RENAMED Viewed

@@ -10,12 +10,7 @@ meds:
   output_dir: ???
 checkpoint:
-  save_dir: checkpoints
-  best_checkpoint_metric: auroc
-  maximize_best_checkpoint_metric: true
-  save_interval: 1
-  keep_last_epochs: 5
-  patience: 10
+  load_checkpoint: ???
 dataset:
   data_format: meds
@@ -44,10 +39,6 @@ criterion:
   num_labels:
     - 1
-optimization:
-  max_epoch: 100
-  lr: 1e-4
 model:
   _name: genhpf_predictor

{genhpf-1.0.3 → genhpf-1.0.4}/pyproject.toml RENAMED Viewed

@@ -29,6 +29,7 @@ dependencies = [
     "scikit-learn==1.6.1",
     "pandas==2.2.3",
     "polars==1.17.1",
+    "pyarrow==17.0.0",
 ]
 [tool.setuptools_scm]

{genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf/scripts/preprocess/preprocess_meds.py RENAMED Viewed

@@ -1,10 +1,10 @@
 import functools
+import logging
 import glob
 import multiprocessing
 import os
 import re
 import shutil
-import warnings
 from argparse import ArgumentParser
 from bisect import bisect_left, bisect_right
 from datetime import datetime
@@ -17,6 +17,9 @@ import polars as pl
 from tqdm import tqdm
 from transformers import AutoTokenizer
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
 pool_manager = multiprocessing.Manager()
 warned_codes = pool_manager.list()
@@ -82,6 +85,13 @@ def get_parser():
         action="store_true",
         help="whether or not to rebase the output directory if exists.",
     )
+    parser.add_argument(
+        "--debug",
+        type=bool,
+        default=False,
+        help="whether or not to enable the debug mode, which forces the script to be run with "
+        "only one worker."
+    )
     parser.add_argument(
         "--workers",
         metavar="N",
@@ -117,6 +127,16 @@ def main():
     mimic_dir = Path(args.mimic_dir) if args.mimic_dir is not None else None
     num_workers = max(args.workers, 1)
+    if args.debug:
+        num_workers = 1
+    else:
+        cpu_count = multiprocessing.cpu_count()
+        if num_workers > cpu_count:
+            logger.warning(
+                f"Number of workers (--workers) is greater than the number of available CPUs "
+                f"({cpu_count}). Setting the number of workers to {cpu_count}."
+            )
+            num_workers = cpu_count
     if root_path.is_dir():
         data_paths = glob.glob(str(root_path / "**/*.csv"), recursive=True)
@@ -132,7 +152,8 @@ def main():
     else:
         if args.rebase:
             shutil.rmtree(output_dir)
-        if output_dir.exists():
+            output_dir.mkdir()
+        elif output_dir.exists():
             if args.skip_if_exists:
                 ls = glob.glob(str(output_dir / "**/*"), recursive=True)
                 expected_files = []
@@ -142,7 +163,7 @@ def main():
                         for i in range(num_workers)
                     ])
                 if set(expected_files).issubset(set(ls)):
-                    print(
+                    logger.info(
                         f"Output directory already contains the expected files. Skipping the "
                         "processing as --skip-if-exists is set. If you want to rebase the directory, "
                         "please run the script with --rebase."
@@ -151,9 +172,8 @@ def main():
             else:
                 raise ValueError(
                     f"File exists: '{str(output_dir.resolve())}'. If you want to rebase the "
-                    "directory, please run the script with --rebase."
+                    "directory automatically, please run the script with --rebase."
                 )
-        output_dir.mkdir()
     label_col_name = args.cohort_label_name
@@ -330,7 +350,7 @@ def main():
             )
             # meds --> remed
-            print("Processing...")
+            logger.info(f"Start processing {data_path}")
             if num_workers <= 1:
                 length_per_subject_gathered = [meds_to_remed_partial(data)]
                 del data
@@ -342,6 +362,15 @@ def main():
                 for subject_id_chunk in subject_id_chunks:
                     data_chunks.append(data.filter(pl.col("subject_id").is_in(subject_id_chunk)))
                 del data
+                num_valid_data_chunks = sum(map(lambda x: len(x) > 0, data_chunks))
+                if num_valid_data_chunks < num_workers:
+                    raise ValueError(
+                        "Number of valid data chunks (= number of unique subjects) were smaller "
+                        "than the specified num workers (--workers) due to the small size of data. "
+                        "Consider reducing the number of workers."
+                    )
                 pool = multiprocessing.get_context("spawn").Pool(processes=num_workers)
                 # the order is preserved
                 length_per_subject_gathered = pool.map(meds_to_remed_partial, data_chunks)
@@ -350,7 +379,7 @@ def main():
                 del data_chunks
             if len(length_per_subject_gathered) != num_workers:
-                print(
+                raise ValueError(
                     "Number of processed workers were smaller than the specified num workers "
                     "(--workers) due to the small size of data. Consider reducing the number of "
                     "workers."
@@ -420,7 +449,7 @@ def meds_to_remed(
                             if do_break and col_event not in warned_codes:
                                 warned_codes.append(col_event)
-                                warnings.warn(
+                                logger.warning(
                                     "The dataset contains some codes that are not specified in "
                                     "the codes metadata, which may not be intended. Note that we "
                                     f"process this code as it is for now: {col_event}."

{genhpf-1.0.3 → genhpf-1.0.4/src/genhpf.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: genhpf
-Version: 1.0.3
+Version: 1.0.4
 Summary: GenHPF: General Healthcare Predictive Framework with Multi-task Multi-source Learning
 Author-email: Jungwoo Oh <ojw0123@kaist.ac.kr>, Kyunghoon Hur <pacesun@kaist.ac.kr>
 License: MIT license
@@ -18,6 +18,7 @@ Requires-Dist: h5pickle==0.4.2
 Requires-Dist: scikit-learn==1.6.1
 Requires-Dist: pandas==2.2.3
 Requires-Dist: polars==1.17.1
+Requires-Dist: pyarrow==17.0.0
 Provides-Extra: dev
 Requires-Dist: pre-commit; extra == "dev"
 Requires-Dist: black; extra == "dev"

{genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf.egg-info/SOURCES.txt RENAMED Viewed

@@ -3,7 +3,6 @@
 LICENSE
 README.md
 pyproject.toml
-requirements.txt
 examples/pretrain/mlm/genhpf/flattened_pt.yaml
 examples/pretrain/simclr/genhpf/genhpf_hierarchical_pt.yaml
 examples/pretrain/wav2vec2/genhpf/hierarchical_pt.yaml

{genhpf-1.0.3 → genhpf-1.0.4}/src/genhpf.egg-info/requires.txt RENAMED Viewed

@@ -6,6 +6,7 @@ h5pickle==0.4.2
 scikit-learn==1.6.1
 pandas==2.2.3
 polars==1.17.1
+pyarrow==17.0.0
 [dev]
 pre-commit

genhpf-1.0.3/requirements.txt DELETED Viewed

@@ -1,48 +0,0 @@
-appdirs==1.4.4
-axial-positional-embedding==0.2.1
-cachetools==5.3.1
-certifi==2023.5.7
-charset-normalizer==3.1.0
-click==8.1.3
-docker-pycreds==0.4.0
-einops==0.6.1
-filelock==3.12.0
-fsspec==2023.5.0
-gitdb==4.0.10
-GitPython==3.1.31
-h5pickle==0.4.2
-h5py==3.8.0
-huggingface-hub==0.14.1
-idna==3.4
-joblib==1.2.0
-local-attention==1.8.6
-numpy==1.24.3
-packaging==23.1
-pandas==2.0.2
-pathtools==0.1.2
-performer-pytorch==1.1.4
-Pillow==9.5.0
-protobuf==4.23.2
-psutil==5.9.5
-python-dateutil==2.8.2
-pytz==2023.3
-PyYAML==6.0
-regex==2023.5.5
-requests==2.31.0
-scikit-learn==1.2.2
-scipy==1.10.1
-sentry-sdk==1.24.0
-setproctitle==1.3.2
-six==1.16.0
-smmap==5.0.0
-threadpoolctl==3.1.0
-tokenizers==0.13.3
-tqdm==4.65.0
-transformers==4.29.2
-typing_extensions==4.6.2
-tzdata==2023.3
-urllib3==1.26.16
-wandb==0.15.3
-pyspark==3.5.0
-pyarrow==14.0.1
-treelib==1.7.0