PyPI - slide2vec - Versions diffs - 1.3.0__tar.gz → 2.0.0__tar.gz - Mend

slide2vec 1.3.0__tar.gz → 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44)

{slide2vec-1.3.0/slide2vec.egg-info → slide2vec-2.0.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: slide2vec
-Version: 1.3.0
+Version: 2.0.0
 Summary: Embedding of whole slide images with Foundation Models
 Home-page: https://github.com/clemsgrs/slide2vec
 Author: Clément Grisi
@@ -95,7 +95,7 @@ pip install slide2vec
    A good starting point is the default configuration file `slide2vec/configs/default.yaml` where parameters are documented.<br>
    We've also added default configuration files for each of the foundation models currently supported:
-   - tile-level: `uni`, `uni2`, `virchow`, `virchow2`, `prov-gigapath`, `h-optimus-0`, `h-optimus-1`
+   - tile-level: `uni`, `uni2`, `virchow`, `virchow2`, `prov-gigapath`, `h-optimus-0`, `h-optimus-1`, `h0-mini`, `conch`, `musk`, `phikonv2`, `hibou-b`, `hibou-L`, [`kaiko`](https://github.com/kaiko-ai/towards_large_pathology_fms)
    - slide-level: `prov-gigapath`, `titan`, `prism`

{slide2vec-1.3.0 → slide2vec-2.0.0}/README.md RENAMED Viewed

@@ -41,7 +41,7 @@ pip install slide2vec
    A good starting point is the default configuration file `slide2vec/configs/default.yaml` where parameters are documented.<br>
    We've also added default configuration files for each of the foundation models currently supported:
-   - tile-level: `uni`, `uni2`, `virchow`, `virchow2`, `prov-gigapath`, `h-optimus-0`, `h-optimus-1`
+   - tile-level: `uni`, `uni2`, `virchow`, `virchow2`, `prov-gigapath`, `h-optimus-0`, `h-optimus-1`, `h0-mini`, `conch`, `musk`, `phikonv2`, `hibou-b`, `hibou-L`, [`kaiko`](https://github.com/kaiko-ai/towards_large_pathology_fms)
    - slide-level: `prov-gigapath`, `titan`, `prism`

{slide2vec-1.3.0 → slide2vec-2.0.0}/pyproject.toml RENAMED Viewed

@@ -23,7 +23,7 @@ warn_unused_configs = true
 no_implicit_reexport = true
 [tool.bumpver]
-current_version = "1.3.0"
+current_version = "2.0.0"
 version_pattern = "MAJOR.MINOR.PATCH"
 commit = false       # We do version bumping in CI, not as a commit
 tag = false          # Git tag already exists — we don't auto-tag

{slide2vec-1.3.0 → slide2vec-2.0.0}/setup.cfg RENAMED Viewed

@@ -1,6 +1,6 @@
 [metadata]
 name = slide2vec
-version = 1.3.0
+version = 2.0.0
 description = Embedding of whole slide images with Foundation Models
 author = Clément Grisi
 platforms = unix, linux, osx, cygwin, win32

slide2vec-2.0.0/slide2vec/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "2.0.0"

{slide2vec-1.3.0 → slide2vec-2.0.0}/slide2vec/aggregate.py RENAMED Viewed

@@ -28,14 +28,20 @@ def get_args_parser(add_help: bool = True):
         "--config-file", default="", metavar="FILE", help="path to config file"
     )
     parser.add_argument(
-        "--run-id",
+        "--output-dir",
         type=str,
-        default="",
-        help="Name of output subdirectory",
+        default=None,
+        help="output directory to save logs and checkpoints",
     )
     parser.add_argument(
         "--run-on-cpu", action="store_true", help="run inference on cpu"
     )
+    parser.add_argument(
+        "opts",
+        help="Modify config options at the end of the command using \"path.key=value\".",
+        default=None,
+        nargs=argparse.REMAINDER,
+    )
     return parser
@@ -54,7 +60,7 @@ def main(args):
     # setup configuration
     run_on_cpu = args.run_on_cpu
     cfg = get_cfg_from_file(args.config_file)
-    output_dir = Path(cfg.output_dir, args.run_id)
+    output_dir = Path(cfg.output_dir, args.output_dir)
     cfg.output_dir = str(output_dir)
     coordinates_dir = Path(cfg.output_dir, "coordinates")
@@ -71,6 +77,11 @@ def main(args):
         process_list.is_file()
     ), "Process list CSV not found. Ensure tiling has been run."
     process_df = pd.read_csv(process_list)
+    if "aggregation_status" not in process_df.columns:
+        process_df["aggregation_status"] = ["tbp"] * len(process_df)
+        cols = ["wsi_name", "wsi_path", "mask_path", "tiling_status", "feature_status", "aggregation_status", "error", "traceback"]
+        process_df = process_df[cols]
     skip_feature_aggregation = process_df["aggregation_status"].str.contains("success").all()
     if skip_feature_aggregation and distributed.is_main_process():
@@ -111,6 +122,9 @@ def main(args):
             coordinates = (np.array([coordinates_arr["x"], coordinates_arr["y"]]).T).astype(int)
             feature_path = features_dir / f"{name}.pt"
+            output_path = features_dir / f"{name}.pt"
+            if cfg.model.save_tile_embeddings:
+                feature_path = features_dir / f"{name}-tiles.pt"
             # run forward pass with slide encoder
             if cfg.model.name == "prov-gigapath":
@@ -132,13 +146,19 @@ def main(args):
                 with autocast_context:
                     features = torch.load(feature_path).to(model.device)
                     tile_size_lv0 = coordinates_arr["tile_size_lv0"][0]
-                    wsi_feature = model.forward_slide(
+                    output = model.forward_slide(
                         features,
                         tile_coordinates=coordinates,
                         tile_size_lv0=tile_size_lv0,
                     )
-            torch.save(wsi_feature, feature_path)
+                    wsi_feature = output["embedding"].cpu()
+                    if cfg.model.name == "prism" and cfg.model.save_latents:
+                        latent_path = features_dir / f"{name}-latents.pt"
+                        latents = output["latents"].cpu()
+                        torch.save(latents, latent_path)
+                        del latents
+            torch.save(wsi_feature, output_path)
             del wsi_feature
             if not run_on_cpu:
                 torch.cuda.empty_cache()

{slide2vec-1.3.0 → slide2vec-2.0.0}/slide2vec/data/dataset.py RENAMED Viewed

@@ -2,6 +2,7 @@ import torch
 import numpy as np
 import wholeslidedata as wsd
+from transformers.image_processing_utils import BaseImageProcessor
 from PIL import Image
 from pathlib import Path
@@ -58,5 +59,8 @@ class TileDataset(torch.utils.data.Dataset):
         if self.tile_size[idx] != self.tile_size_resized[idx]:
             tile = tile.resize((self.tile_size[idx], self.tile_size[idx]))
         if self.transforms:
-            tile = self.transforms(tile)
+            if isinstance(self.transforms, BaseImageProcessor):  # Hugging Face (`transformers`)
+                tile = self.transforms(tile, return_tensors="pt")["pixel_values"].squeeze(0)
+            else:  # general callable such as torchvision transforms
+                tile = self.transforms(tile)
         return idx, tile

{slide2vec-1.3.0 → slide2vec-2.0.0}/slide2vec/embed.py RENAMED Viewed

@@ -28,14 +28,20 @@ def get_args_parser(add_help: bool = True):
         "--config-file", default="", metavar="FILE", help="path to config file"
     )
     parser.add_argument(
-        "--run-id",
+        "--output-dir",
         type=str,
-        default="",
-        help="Name of output subdirectory",
+        default=None,
+        help="output directory to save logs and checkpoints",
     )
     parser.add_argument(
         "--run-on-cpu", action="store_true", help="run inference on cpu"
     )
+    parser.add_argument(
+        "opts",
+        help="Modify config options at the end of the command using \"path.key=value\".",
+        default=None,
+        nargs=argparse.REMAINDER,
+    )
     return parser
@@ -80,7 +86,7 @@ def run_inference(dataloader, model, device, autocast_context, unit, batch_size,
             ):
                 idx, image = batch
                 image = image.to(device, non_blocking=True)
-                feature = model(image).cpu().numpy()
+                feature = model(image)["embedding"].cpu().numpy()
                 features.resize(features.shape[0] + feature.shape[0], axis=0)
                 features[-feature.shape[0]:] = feature
                 indices.resize(indices.shape[0] + idx.shape[0], axis=0)
@@ -123,7 +129,7 @@ def main(args):
     # setup configuration
     run_on_cpu = args.run_on_cpu
     cfg = get_cfg_from_file(args.config_file)
-    output_dir = Path(cfg.output_dir, args.run_id)
+    output_dir = Path(cfg.output_dir, args.output_dir)
     cfg.output_dir = str(output_dir)
     if not run_on_cpu:
@@ -148,6 +154,11 @@ def main(args):
         process_list.is_file()
     ), "Process list CSV not found. Ensure tiling has been run."
     process_df = pd.read_csv(process_list)
+    if "feature_status" not in process_df.columns:
+        process_df["feature_status"] = ["tbp"] * len(process_df)
+        cols = ["wsi_name", "wsi_path", "mask_path", "tiling_status", "feature_status", "error", "traceback"]
+        process_df = process_df[cols]
     skip_feature_extraction = process_df["feature_status"].str.contains("success").all()
     if skip_feature_extraction:
@@ -219,13 +230,15 @@ def main(args):
                 name = wsi_fp.stem.replace(" ", "_")
                 feature_path = features_dir / f"{name}.pt"
+                if cfg.model.save_tile_embeddings:
+                    feature_path = features_dir / f"{name}-tiles.pt"
                 tmp_feature_path = tmp_dir / f"{name}-rank_{distributed.get_global_rank()}.h5"
                 # get feature dimension and dtype using a dry run
                 with torch.inference_mode(), autocast_context:
                     sample_batch = next(iter(dataloader))
                     sample_image = sample_batch[1].to(model.device)
-                    sample_feature = model(sample_image).cpu().numpy()
+                    sample_feature = model(sample_image)["embedding"].cpu().numpy()
                     feature_dim = sample_feature.shape[1:]
                     dtype = sample_feature.dtype

{slide2vec-1.3.0 → slide2vec-2.0.0}/slide2vec/main.py RENAMED Viewed

@@ -24,6 +24,17 @@ def get_args_parser(add_help: bool = True):
     parser.add_argument(
         "--run-on-cpu", action="store_true", help="run inference on cpu"
     )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        help="output directory to save logs and checkpoints",
+    )
+    parser.add_argument(
+        "opts",
+        help="Modify config options at the end of the command using \"path.key=value\".",
+        default=None,
+        nargs=argparse.REMAINDER,
+    )
     return parser
@@ -37,35 +48,26 @@ def log_progress(features_dir: Path, stop_event: threading.Event, log_interval:
         time.sleep(log_interval)
-def run_tiling(config_file, run_id):
-    print("Running tiling.py...")
+def run_tiling(root_dir, config_file, output_dir):
+    print(f"Running tiling.py from {root_dir}...")
     cmd = [
         sys.executable,
-        "slide2vec/tiling.py",
-        "--run-id",
-        run_id,
+        "hs2p/tiling.py",
         "--config-file",
-        config_file,
+        os.path.abspath(config_file),
+        "--output-dir",
+        os.path.abspath(output_dir),
+        "--skip-datetime",
+        "--skip-logging",
+        "wandb.enable=false", # disable wandb to avoid duplicate logging
     ]
-    proc = subprocess.Popen(
-        cmd,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        text=True,
-        bufsize=1,
-        universal_newlines=True
-    )
-    # forward output in real-time
-    for line in proc.stdout:
-        print(line.rstrip())
-        sys.stdout.flush()
-    proc.wait()
+    proc = subprocess.run(cmd, cwd=root_dir)
     if proc.returncode != 0:
         print("Slide tiling failed. Exiting.")
         sys.exit(proc.returncode)
-def run_feature_extraction(config_file, run_id, run_on_cpu: False):
+def run_feature_extraction(config_file, output_dir, run_on_cpu: False):
     print("Running embed.py...")
     # find a free port
     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
@@ -78,36 +80,24 @@ def run_feature_extraction(config_file, run_id, run_on_cpu: False):
         f"--master_port={free_port}",
         "--nproc_per_node=gpu",
         "slide2vec/embed.py",
-        "--run-id",
-        run_id,
         "--config-file",
-        config_file,
+        os.path.abspath(config_file),
+        "--output-dir",
+        os.path.abspath(output_dir),
     ]
     if run_on_cpu:
         cmd = [
             sys.executable,
             "slide2vec/embed.py",
-            "--run-id",
-            run_id,
             "--config-file",
-            config_file,
+            os.path.abspath(config_file),
+            "--output-dir",
+            os.path.abspath(output_dir),
             "--run-on-cpu",
         ]
     # launch in its own process group.
-    proc = subprocess.Popen(
-        cmd,
-        preexec_fn=os.setsid,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        text=True,
-        bufsize=1,
-        universal_newlines=True
-    )
+    proc = subprocess.Popen(cmd)
     try:
-        # forward output in real-time
-        for line in proc.stdout:
-            print(line.rstrip())
-            sys.stdout.flush()
         proc.wait()
     except KeyboardInterrupt:
         print("Received CTRL+C, terminating embed.py process group...")
@@ -119,34 +109,22 @@ def run_feature_extraction(config_file, run_id, run_on_cpu: False):
         sys.exit(proc.returncode)
-def run_feature_aggregation(config_file, run_id, run_on_cpu: False):
+def run_feature_aggregation(config_file, output_dir, run_on_cpu: False):
     print("Running aggregate.py...")
     # find a free port
     cmd = [
         sys.executable,
         "slide2vec/aggregate.py",
-        "--run-id",
-        run_id,
         "--config-file",
-        config_file,
+        os.path.abspath(config_file),
+        "--output-dir",
+        os.path.abspath(output_dir),
     ]
     if run_on_cpu:
         cmd.append("--run-on-cpu")
     # launch in its own process group.
-    proc = subprocess.Popen(
-        cmd,
-        preexec_fn=os.setsid,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        text=True,
-        bufsize=1,
-        universal_newlines=True
-    )
+    proc = subprocess.Popen(cmd)
     try:
-        # forward output in real-time
-        for line in proc.stdout:
-            print(line.rstrip())
-            sys.stdout.flush()
         proc.wait()
     except KeyboardInterrupt:
         print("Received CTRL+C, terminating aggregate.py process group...")
@@ -159,19 +137,19 @@ def run_feature_aggregation(config_file, run_id, run_on_cpu: False):
 def main(args):
-    config_file = args.config_file
-    skip_datetime = args.skip_datetime
     run_on_cpu = args.run_on_cpu
-    cfg, run_id = setup(config_file, skip_datetime=skip_datetime)
+    cfg, cfg_path = setup(args)
+    output_dir = Path(cfg.output_dir)
     hf_login()
-    run_tiling(config_file, run_id)
+    root_dir = "slide2vec/hs2p"
+    run_tiling(root_dir, cfg_path, output_dir)
     print("Tiling completed.")
     print("=+=" * 10)
-    output_dir = Path(cfg.output_dir)
     features_dir = output_dir / "features"
     if cfg.wandb.enable:
         stop_event = threading.Event()
@@ -180,10 +158,10 @@ def main(args):
         )
         log_thread.start()
-    run_feature_extraction(config_file, run_id, run_on_cpu)
+    run_feature_extraction(cfg_path, output_dir, run_on_cpu)
     if cfg.model.level == "slide":
-        run_feature_aggregation(config_file, run_id, run_on_cpu)
+        run_feature_aggregation(cfg_path, output_dir, run_on_cpu)
         print("Feature extraction completed.")
         print("=+=" * 10)
     else:
@@ -203,9 +181,9 @@ if __name__ == "__main__":
     import warnings
     import torchvision
     torchvision.disable_beta_transforms_warning()
     warnings.filterwarnings("ignore", message=".*Could not set the permissions.*")
     warnings.filterwarnings("ignore", message=".*antialias.*", category=UserWarning)
     warnings.filterwarnings("ignore", message=".*TypedStorage.*", category=UserWarning)

slide2vec 1.3.0__tar.gz → 2.0.0__tar.gz

slide2vec 1.3.0__tar.gz → 2.0.0__tar.gz