scdataloader 1.9.1__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,9 @@
1
+ import multiprocessing as mp
1
2
  import os
3
+ import random
4
+ import time
5
+ from concurrent.futures import ProcessPoolExecutor, as_completed
6
+ from functools import partial
2
7
  from typing import Optional, Sequence, Union
3
8
 
4
9
  import lamindb as ln
@@ -13,10 +18,11 @@ from torch.utils.data.sampler import (
13
18
  SubsetRandomSampler,
14
19
  WeightedRandomSampler,
15
20
  )
21
+ from tqdm import tqdm
16
22
 
17
23
  from .collator import Collator
18
24
  from .data import Dataset
19
- from .utils import getBiomartTable, slurm_restart_count
25
+ from .utils import fileToList, getBiomartTable, listToFile
20
26
 
21
27
  FILE_DIR = os.path.dirname(os.path.abspath(__file__))
22
28
 
@@ -26,9 +32,8 @@ class DataModule(L.LightningDataModule):
26
32
  self,
27
33
  collection_name: str,
28
34
  clss_to_weight: list = ["organism_ontology_term_id"],
29
- organisms: list = ["NCBITaxon:9606"],
30
35
  weight_scaler: int = 10,
31
- train_oversampling_per_epoch: float = 0.1,
36
+ n_samples_per_epoch: int = 2_000_000,
32
37
  validation_split: float = 0.2,
33
38
  test_split: float = 0,
34
39
  gene_embeddings: str = "",
@@ -43,7 +48,7 @@ class DataModule(L.LightningDataModule):
43
48
  max_len: int = 1000,
44
49
  add_zero_genes: int = 100,
45
50
  replacement: bool = True,
46
- do_gene_pos: Union[bool, str] = True,
51
+ do_gene_pos: str = "",
47
52
  tp_name: Optional[str] = None, # "heat_diff"
48
53
  assays_to_drop: list = [
49
54
  # "EFO:0008853", #patch seq
@@ -53,7 +58,10 @@ class DataModule(L.LightningDataModule):
53
58
  ],
54
59
  metacell_mode: float = 0.0,
55
60
  get_knn_cells: bool = False,
56
- modify_seed_on_requeue: bool = True,
61
+ store_location: str = None,
62
+ force_recompute_indices: bool = False,
63
+ sampler_workers: int = None,
64
+ sampler_chunk_size: int = None,
57
65
  **kwargs,
58
66
  ):
59
67
  """
@@ -65,9 +73,8 @@ class DataModule(L.LightningDataModule):
65
73
 
66
74
  Args:
67
75
  collection_name (str): The lamindb collection to be used.
68
- organisms (list, optional): The organisms to include in the dataset. Defaults to ["NCBITaxon:9606"].
69
76
  weight_scaler (int, optional): how much more often the most frequent category is sampled relative to the least frequent one.
70
- train_oversampling_per_epoch (float, optional): The proportion of the dataset to include in the training set for each epoch. Defaults to 0.1.
77
+ n_samples_per_epoch (int, optional): The number of samples to include in the training set for each epoch. Defaults to 2_000_000.
71
78
  validation_split (float, optional): The proportion of the dataset to include in the validation split. Defaults to 0.2.
72
79
  test_split (float, optional): The proportion of the dataset to include in the test split. Defaults to 0.
73
80
  the test split holds out entire datasets, rounding to the nearest dataset's cell count.
@@ -88,61 +95,38 @@ class DataModule(L.LightningDataModule):
88
95
  hierarchical_clss (list, optional): List of hierarchical classes. Defaults to [].
89
96
  metacell_mode (float, optional): The probability of using metacell mode. Defaults to 0.0.
90
97
  clss_to_predict (list, optional): List of classes to predict. Defaults to ["organism_ontology_term_id"].
91
- modify_seed_on_requeue (bool, optional): Whether to modify the seed on requeue. Defaults to True.
92
98
  get_knn_cells (bool, optional): Whether to get the k-nearest neighbors of each queried cell. Defaults to False.
99
+ store_location (str, optional): The location to store the sampler indices. Defaults to None.
100
+ force_recompute_indices (bool, optional): Whether to force recompute the sampler indices. Defaults to False.
101
+ sampler_workers (int, optional): The number of workers to use for the sampler. Defaults to None (auto-determined).
102
+ sampler_chunk_size (int, optional): The size of the chunks to use for the sampler. Defaults to None (auto-determined).
93
103
  **kwargs: Additional keyword arguments passed to the pytorch DataLoader.
94
104
  see @file data.py and @file collator.py for more details about some of the parameters
95
105
  """
96
- if collection_name is not None:
97
- mdataset = Dataset(
98
- ln.Collection.filter(name=collection_name).first(),
99
- organisms=organisms,
100
- clss_to_predict=clss_to_predict,
101
- hierarchical_clss=hierarchical_clss,
102
- metacell_mode=metacell_mode,
103
- get_knn_cells=get_knn_cells,
106
+ if "organism_ontology_term_id" not in clss_to_predict:
107
+ raise ValueError(
108
+ "need 'organism_ontology_term_id' in the set of classes at least"
104
109
  )
110
+ mdataset = Dataset(
111
+ ln.Collection.filter(name=collection_name, is_latest=True).first(),
112
+ clss_to_predict=clss_to_predict,
113
+ hierarchical_clss=hierarchical_clss,
114
+ metacell_mode=metacell_mode,
115
+ get_knn_cells=get_knn_cells,
116
+ store_location=store_location,
117
+ force_recompute_indices=force_recompute_indices,
118
+ )
105
119
  # and location
106
120
  self.metacell_mode = bool(metacell_mode)
107
121
  self.gene_pos = None
108
122
  self.collection_name = collection_name
109
123
  if do_gene_pos:
110
- if type(do_gene_pos) is str:
111
- print("seeing a string: loading gene positions as biomart parquet file")
112
- biomart = pd.read_parquet(do_gene_pos)
113
- else:
114
- # and annotations
115
- if organisms != ["NCBITaxon:9606"]:
116
- raise ValueError(
117
- "need to provide your own table as this automated function only works for humans for now"
118
- )
119
- biomart = getBiomartTable(
120
- attributes=["start_position", "chromosome_name"],
121
- useCache=True,
122
- ).set_index("ensembl_gene_id")
123
- biomart = biomart.loc[~biomart.index.duplicated(keep="first")]
124
- biomart = biomart.sort_values(by=["chromosome_name", "start_position"])
125
- c = []
126
- i = 0
127
- prev_position = -100000
128
- prev_chromosome = None
129
- for _, r in biomart.iterrows():
130
- if (
131
- r["chromosome_name"] != prev_chromosome
132
- or r["start_position"] - prev_position > gene_position_tolerance
133
- ):
134
- i += 1
135
- c.append(i)
136
- prev_position = r["start_position"]
137
- prev_chromosome = r["chromosome_name"]
138
- print(f"reduced the size to {len(set(c)) / len(biomart)}")
139
- biomart["pos"] = c
124
+ biomart = pd.read_parquet(do_gene_pos)
140
125
  mdataset.genedf = mdataset.genedf.join(biomart, how="inner")
141
126
  self.gene_pos = mdataset.genedf["pos"].astype(int).tolist()
142
-
143
127
  if gene_embeddings != "":
144
128
  mdataset.genedf = mdataset.genedf.join(
145
- pd.read_parquet(gene_embeddings), how="inner"
129
+ pd.read_parquet(gene_embeddings).loc[:, :2], how="inner"
146
130
  )
147
131
  if do_gene_pos:
148
132
  self.gene_pos = mdataset.genedf["pos"].tolist()
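
Taken together, the constructor changes above redefine the public API: organisms, train_oversampling_per_epoch and modify_seed_on_requeue are gone, do_gene_pos now expects a path to a pre-built biomart parquet file (with a "pos" column) instead of triggering an automatic download, and new caching/sampling knobs (n_samples_per_epoch, store_location, force_recompute_indices, sampler_workers, sampler_chunk_size) are introduced. A minimal usage sketch against the 2.0.0 signature shown in this diff; the collection name and file paths are placeholders, and the top-level DataModule import is assumed:

    from scdataloader import DataModule  # assumed import path

    datamodule = DataModule(
        collection_name="my-lamin-collection",          # placeholder lamindb collection
        clss_to_predict=["organism_ontology_term_id"],  # must contain this term, see the check above
        clss_to_weight=["organism_ontology_term_id"],
        do_gene_pos="gene_positions.parquet",           # pre-built biomart table with a "pos" column
        n_samples_per_epoch=2_000_000,
        store_location="cache/scdataloader",            # split and sampler indices are cached here
        force_recompute_indices=False,
        sampler_workers=8,
        batch_size=64,                                  # forwarded to the torch DataLoader via **kwargs
    )
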
@@ -151,7 +135,7 @@ class DataModule(L.LightningDataModule):
151
135
  # we might want to not introduce zeros and
152
136
  if use_default_col:
153
137
  kwargs["collate_fn"] = Collator(
154
- organisms=organisms,
138
+ organisms=mdataset.organisms,
155
139
  how=how,
156
140
  valid_genes=mdataset.genedf.index.tolist(),
157
141
  max_len=max_len,
@@ -159,7 +143,7 @@ class DataModule(L.LightningDataModule):
159
143
  org_to_id=mdataset.encoder[organism_name],
160
144
  tp_name=tp_name,
161
145
  organism_name=organism_name,
162
- class_names=clss_to_predict,
146
+ class_names=list(self.classes.keys()),
163
147
  )
164
148
  self.validation_split = validation_split
165
149
  self.test_split = test_split
@@ -171,16 +155,19 @@ class DataModule(L.LightningDataModule):
171
155
  self.assays_to_drop = assays_to_drop
172
156
  self.n_samples = len(mdataset)
173
157
  self.weight_scaler = weight_scaler
174
- self.train_oversampling_per_epoch = train_oversampling_per_epoch
158
+ self.n_samples_per_epoch = n_samples_per_epoch
175
159
  self.clss_to_weight = clss_to_weight
176
160
  self.train_weights = None
177
161
  self.train_labels = None
178
- self.modify_seed_on_requeue = modify_seed_on_requeue
162
+ self.sampler_workers = sampler_workers
163
+ self.sampler_chunk_size = sampler_chunk_size
164
+ self.store_location = store_location
179
165
  self.nnz = None
180
- self.restart_num = 0
181
166
  self.test_datasets = []
167
+ self.force_recompute_indices = force_recompute_indices
182
168
  self.test_idx = []
183
169
  super().__init__()
170
+ print("finished init")
184
171
 
185
172
  def __repr__(self):
186
173
  return (
@@ -190,7 +177,7 @@ class DataModule(L.LightningDataModule):
190
177
  f"\ttest_split={self.test_split},\n"
191
178
  f"\tn_samples={self.n_samples},\n"
192
179
  f"\tweight_scaler={self.weight_scaler},\n"
193
- f"\ttrain_oversampling_per_epoch={self.train_oversampling_per_epoch},\n"
180
+ f"\tn_samples_per_epoch={self.n_samples_per_epoch},\n"
194
181
  f"\tassays_to_drop={self.assays_to_drop},\n"
195
182
  f"\tnum_datasets={len(self.dataset.mapped_dataset.storages)},\n"
196
183
  f"\ttest datasets={str(self.test_datasets)},\n"
@@ -242,6 +229,44 @@ class DataModule(L.LightningDataModule):
242
229
  """
243
230
  return self.dataset.genedf.index.tolist()
244
231
 
232
+ @genes.setter
233
+ def genes(self, genes):
234
+ self.dataset.genedf = self.dataset.genedf.loc[genes]
235
+ self.kwargs["collate_fn"].genes = genes
236
+ self.kwargs["collate_fn"]._setup(
237
+ genedf=self.dataset.genedf,
238
+ org_to_id=self.kwargs["collate_fn"].org_to_id,
239
+ valid_genes=genes,
240
+ )
241
+
242
+ @property
243
+ def encoders(self):
244
+ return self.dataset.encoder
245
+
246
+ @encoders.setter
247
+ def encoders(self, encoders):
248
+ self.dataset.encoder = encoders
249
+ self.kwargs["collate_fn"].org_to_id = encoders[
250
+ self.kwargs["collate_fn"].organism_name
251
+ ]
252
+ self.kwargs["collate_fn"]._setup(
253
+ org_to_id=self.kwargs["collate_fn"].org_to_id,
254
+ valid_genes=self.genes,
255
+ )
256
+
257
+ @property
258
+ def organisms(self):
259
+ return self.dataset.organisms
260
+
261
+ @organisms.setter
262
+ def organisms(self, organisms):
263
+ self.dataset.organisms = organisms
264
+ self.kwargs["collate_fn"].organisms = organisms
265
+ self.kwargs["collate_fn"]._setup(
266
+ org_to_id=self.kwargs["collate_fn"].org_to_id,
267
+ valid_genes=self.genes,
268
+ )
269
+
245
270
  @property
246
271
  def num_datasets(self):
247
272
  return len(self.dataset.mapped_dataset.storages)
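
The new genes, encoders and organisms properties added above keep the underlying Dataset and the Collator stored in kwargs["collate_fn"] in sync whenever one of them is overridden (each setter re-runs Collator._setup). A hedged sketch of how a caller might use them; the gene panel and encoder mapping below are illustrative, not values from this diff:

    # restrict the datamodule to a fixed gene panel
    panel = datamodule.genes[:2000]
    datamodule.genes = panel

    # reuse label encoders from a pretrained model so category ids stay consistent,
    # e.g. {"organism_ontology_term_id": {"NCBITaxon:9606": 0, "NCBITaxon:10090": 1}, ...}
    datamodule.encoders = pretrained_encoders

    # narrow the organisms handled by the collator
    datamodule.organisms = ["NCBITaxon:9606"]
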
@@ -256,106 +281,191 @@ class DataModule(L.LightningDataModule):
256
281
  It can be either 'fit' or 'test'. Defaults to None.
257
282
  """
258
283
  SCALE = 10
259
- if "nnz" in self.clss_to_weight and self.weight_scaler > 0:
260
- self.nnz = self.dataset.mapped_dataset.get_merged_labels("nnz")
261
- self.clss_to_weight.remove("nnz")
262
- (
263
- (self.nnz.max() / SCALE)
264
- / ((1 + self.nnz - self.nnz.min()) + (self.nnz.max() / SCALE))
265
- ).min()
266
- if len(self.clss_to_weight) > 0 and self.weight_scaler > 0:
267
- weights, labels = self.dataset.get_label_weights(
268
- self.clss_to_weight,
269
- scaler=self.weight_scaler,
270
- return_categories=True,
284
+ print("setting up the datamodule")
285
+ start_time = time.time()
286
+ if (
287
+ self.store_location is None
288
+ or not os.path.exists(
289
+ os.path.join(self.store_location, "train_weights.npy")
271
290
  )
272
- else:
273
- weights = np.ones(1)
274
- labels = np.zeros(self.n_samples, dtype=int)
275
- if isinstance(self.validation_split, int):
276
- len_valid = self.validation_split
277
- else:
278
- len_valid = int(self.n_samples * self.validation_split)
279
- if isinstance(self.test_split, int):
280
- len_test = self.test_split
281
- else:
282
- len_test = int(self.n_samples * self.test_split)
283
- assert (
284
- len_test + len_valid < self.n_samples
285
- ), "test set + valid set size is configured to be larger than entire dataset."
286
-
287
- idx_full = []
288
- if len(self.assays_to_drop) > 0:
289
- badloc = np.isin(
290
- self.dataset.mapped_dataset.get_merged_labels("assay_ontology_term_id"),
291
- self.assays_to_drop,
292
- )
293
- idx_full = np.arange(len(labels))[~badloc]
294
- else:
295
- idx_full = np.arange(self.n_samples)
296
- if len_test > 0:
297
- # this way we work on some never seen datasets
298
- # keeping at least one
299
- len_test = (
300
- len_test
301
- if len_test > self.dataset.mapped_dataset.n_obs_list[0]
302
- else self.dataset.mapped_dataset.n_obs_list[0]
291
+ or self.force_recompute_indices
292
+ ):
293
+ if "nnz" in self.clss_to_weight and self.weight_scaler > 0:
294
+ self.nnz = self.dataset.mapped_dataset.get_merged_labels(
295
+ "nnz", is_cat=False
296
+ )
297
+ self.clss_to_weight.remove("nnz")
298
+ (
299
+ (self.nnz.max() / SCALE)
300
+ / ((1 + self.nnz - self.nnz.min()) + (self.nnz.max() / SCALE))
301
+ ).min()
302
+ if len(self.clss_to_weight) > 0 and self.weight_scaler > 0:
303
+ weights, labels = self.dataset.get_label_weights(
304
+ self.clss_to_weight,
305
+ scaler=self.weight_scaler,
306
+ return_categories=True,
307
+ )
308
+ else:
309
+ weights = np.ones(1)
310
+ labels = np.zeros(self.n_samples, dtype=int)
311
+ if isinstance(self.validation_split, int):
312
+ len_valid = self.validation_split
313
+ else:
314
+ len_valid = int(self.n_samples * self.validation_split)
315
+ if isinstance(self.test_split, int):
316
+ len_test = self.test_split
317
+ else:
318
+ len_test = int(self.n_samples * self.test_split)
319
+ assert len_test + len_valid < self.n_samples, (
320
+ "test set + valid set size is configured to be larger than entire dataset."
303
321
  )
304
- cs = 0
305
- for i, c in enumerate(self.dataset.mapped_dataset.n_obs_list):
306
- if cs + c > len_test:
307
- break
308
- else:
309
- self.test_datasets.append(
310
- self.dataset.mapped_dataset.path_list[i].path
311
- )
312
- cs += c
313
- len_test = cs
314
- self.test_idx = idx_full[:len_test]
315
- idx_full = idx_full[len_test:]
316
- else:
317
- self.test_idx = None
318
322
 
319
- np.random.shuffle(idx_full)
320
- if len_valid > 0:
321
- self.valid_idx = idx_full[:len_valid].copy()
322
- # store it for later
323
- idx_full = idx_full[len_valid:]
324
- else:
325
- self.valid_idx = None
326
- weights = np.concatenate([weights, np.zeros(1)])
327
- labels[~np.isin(np.arange(self.n_samples), idx_full)] = len(weights) - 1
328
- # some labels will now not exist anymore as replaced by len(weights) - 1.
329
- # this means that the associated weights should be 0.
330
- # by doing np.bincount(labels)*weights this will be taken into account
331
- self.train_weights = weights
332
- self.train_labels = labels
333
- self.idx_full = idx_full
323
+ idx_full = []
324
+ if len(self.assays_to_drop) > 0:
325
+ badloc = np.isin(
326
+ self.dataset.mapped_dataset.get_merged_labels(
327
+ "assay_ontology_term_id"
328
+ ),
329
+ self.assays_to_drop,
330
+ )
331
+ idx_full = np.arange(len(labels))[~badloc]
332
+ else:
333
+ idx_full = np.arange(self.n_samples)
334
+ if len_test > 0:
335
+ # this way we work on some never seen datasets
336
+ # keeping at least one
337
+ len_test = (
338
+ len_test
339
+ if len_test > self.dataset.mapped_dataset.n_obs_list[0]
340
+ else self.dataset.mapped_dataset.n_obs_list[0]
341
+ )
342
+ cs = 0
343
+ d_size = list(enumerate(self.dataset.mapped_dataset.n_obs_list))
344
+ random.Random(42).shuffle(d_size) # always same order
345
+ for i, c in d_size:
346
+ if cs + c > len_test:
347
+ break
348
+ else:
349
+ self.test_datasets.append(
350
+ self.dataset.mapped_dataset.path_list[i].path
351
+ )
352
+ cs += c
353
+ len_test = cs
354
+ self.test_idx = idx_full[:len_test]
355
+ idx_full = idx_full[len_test:]
356
+ else:
357
+ self.test_idx = None
358
+
359
+ np.random.shuffle(idx_full)
360
+ if len_valid > 0:
361
+ self.valid_idx = idx_full[:len_valid].copy()
362
+ # store it for later
363
+ idx_full = idx_full[len_valid:]
364
+ else:
365
+ self.valid_idx = None
366
+ weights = np.concatenate([weights, np.zeros(1)])
367
+ labels[~np.isin(np.arange(self.n_samples), idx_full)] = len(weights) - 1
368
+ # some labels will now not exist anymore as replaced by len(weights) - 1.
369
+ # this means that the associated weights should be 0.
370
+ # by doing np.bincount(labels)*weights this will be taken into account
371
+ self.train_weights = weights
372
+ self.train_labels = labels
373
+ self.idx_full = idx_full
374
+ if self.store_location is not None:
375
+ if (
376
+ not os.path.exists(
377
+ os.path.join(self.store_location, "train_weights.npy")
378
+ )
379
+ or self.force_recompute_indices
380
+ ):
381
+ os.makedirs(self.store_location, exist_ok=True)
382
+ if self.nnz is not None:
383
+ np.save(os.path.join(self.store_location, "nnz.npy"), self.nnz)
384
+ np.save(
385
+ os.path.join(self.store_location, "train_weights.npy"),
386
+ self.train_weights,
387
+ )
388
+ np.save(
389
+ os.path.join(self.store_location, "train_labels.npy"),
390
+ self.train_labels,
391
+ )
392
+ np.save(
393
+ os.path.join(self.store_location, "idx_full.npy"), self.idx_full
394
+ )
395
+ if self.test_idx is not None:
396
+ np.save(
397
+ os.path.join(self.store_location, "test_idx.npy"), self.test_idx
398
+ )
399
+ if self.valid_idx is not None:
400
+ np.save(
401
+ os.path.join(self.store_location, "valid_idx.npy"),
402
+ self.valid_idx,
403
+ )
404
+ listToFile(
405
+ self.test_datasets,
406
+ os.path.join(self.store_location, "test_datasets.txt"),
407
+ )
408
+ else:
409
+ self.nnz = (
410
+ np.load(os.path.join(self.store_location, "nnz.npy"), mmap_mode="r")
411
+ if os.path.exists(os.path.join(self.store_location, "nnz.npy"))
412
+ else None
413
+ )
414
+ self.train_weights = np.load(
415
+ os.path.join(self.store_location, "train_weights.npy")
416
+ )
417
+ self.train_labels = np.load(
418
+ os.path.join(self.store_location, "train_labels.npy")
419
+ )
420
+ self.idx_full = np.load(
421
+ os.path.join(self.store_location, "idx_full.npy"), mmap_mode="r"
422
+ )
423
+ self.test_idx = (
424
+ np.load(os.path.join(self.store_location, "test_idx.npy"))
425
+ if os.path.exists(os.path.join(self.store_location, "test_idx.npy"))
426
+ else None
427
+ )
428
+ self.valid_idx = (
429
+ np.load(os.path.join(self.store_location, "valid_idx.npy"))
430
+ if os.path.exists(
431
+ os.path.join(self.store_location, "valid_idx.npy")
432
+ )
433
+ else None
434
+ )
435
+ self.test_datasets = fileToList(
436
+ os.path.join(self.store_location, "test_datasets.txt")
437
+ )
438
+ print("loaded from store")
439
+ print(f"done setup, took {time.time() - start_time:.2f} seconds")
334
440
  return self.test_datasets
335
441
 
336
442
  def train_dataloader(self, **kwargs):
337
- # train_sampler = WeightedRandomSampler(
338
- # self.train_weights[self.train_labels],
339
- # int(self.n_samples*self.train_oversampling_per_epoch),
340
- # replacement=True,
341
- # )
342
- try:
343
- train_sampler = LabelWeightedSampler(
344
- label_weights=self.train_weights,
345
- labels=self.train_labels,
346
- num_samples=int(self.n_samples * self.train_oversampling_per_epoch),
347
- element_weights=self.nnz,
348
- replacement=self.replacement,
349
- restart_num=self.restart_num,
350
- modify_seed_on_requeue=self.modify_seed_on_requeue,
351
- )
352
- except ValueError as e:
353
- raise ValueError(e + "have you run `datamodule.setup()`?")
443
+ if len(self.clss_to_weight) > 0 and self.weight_scaler > 0:
444
+ try:
445
+ print("Setting up the parallel train sampler...")
446
+ # Create the optimized parallel sampler
447
+ print(f"Using {self.sampler_workers} workers for class indexing")
448
+ train_sampler = LabelWeightedSampler(
449
+ label_weights=self.train_weights,
450
+ labels=self.train_labels,
451
+ num_samples=int(self.n_samples_per_epoch),
452
+ element_weights=self.nnz,
453
+ replacement=self.replacement,
454
+ n_workers=self.sampler_workers,
455
+ chunk_size=self.sampler_chunk_size,
456
+ store_location=self.store_location,
457
+ force_recompute_indices=self.force_recompute_indices,
458
+ )
459
+ except ValueError as e:
460
+ raise ValueError(str(e) + " Have you run `datamodule.setup()`?")
461
+ else:
462
+ train_sampler = SubsetRandomSampler(self.idx_full)
463
+ current_loader_kwargs = kwargs.copy()
464
+ current_loader_kwargs.update(self.kwargs)
354
465
  return DataLoader(
355
466
  self.dataset,
356
467
  sampler=train_sampler,
357
- **self.kwargs,
358
- **kwargs,
468
+ **current_loader_kwargs,
359
469
  )
360
470
 
361
471
  def val_dataloader(self):
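
The reworked setup() above computes the train/validation/test split and class weights once and, when store_location is given, persists them (train_weights.npy, train_labels.npy, idx_full.npy, test_idx.npy, valid_idx.npy, test_datasets.txt) so later runs just reload them; force_recompute_indices=True rebuilds the cache. train_dataloader() now uses the parallel LabelWeightedSampler only when clss_to_weight is non-empty and weight_scaler > 0, otherwise it falls back to a plain SubsetRandomSampler over the training indices. A minimal sketch of the intended flow, with placeholder loader arguments:

    datamodule.setup()                                      # computes or reloads the cached indices
    train_dl = datamodule.train_dataloader(num_workers=4)
    val_dl = datamodule.val_dataloader()

    for batch in train_dl:
        ...                                                 # training step
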
@@ -385,115 +495,335 @@ class DataModule(L.LightningDataModule):
385
495
  **self.kwargs,
386
496
  )
387
497
 
388
- # def teardown(self):
389
- # clean up state after the trainer stops, delete files...
390
- # called on every process in DDP
391
- # pass
392
-
393
498
 
394
499
  class LabelWeightedSampler(Sampler[int]):
395
- label_weights: Sequence[float]
396
- klass_indices: Sequence[Sequence[int]]
500
+ """
501
+ A weighted random sampler that samples from a dataset with respect to both class weights and element weights.
502
+
503
+ This sampler is designed to handle very large datasets efficiently, with optimizations for:
504
+ 1. Parallel building of class indices
505
+ 2. Chunked processing for large arrays
506
+ 3. Efficient memory management
507
+ 4. Proper handling of replacement and non-replacement sampling
508
+ """
509
+
510
+ label_weights: torch.Tensor
511
+ klass_indices: torch.Tensor
+ klass_offsets: torch.Tensor
397
512
  num_samples: int
398
- nnz: Optional[Sequence[int]]
513
+ element_weights: Optional[torch.Tensor]
399
514
  replacement: bool
400
- restart_num: int
401
- modify_seed_on_requeue: bool
402
- # when we use, just set weights for each classes(here is: np.ones(num_classes)), and labels of a dataset.
403
- # this will result a class-balanced sampling, no matter how imbalance the labels are.
404
515
 
405
516
  def __init__(
406
517
  self,
407
518
  label_weights: Sequence[float],
408
- labels: Sequence[int],
519
+ labels: np.ndarray,
409
520
  num_samples: int,
410
521
  replacement: bool = True,
411
- element_weights: Sequence[float] = None,
412
- restart_num: int = 0,
413
- modify_seed_on_requeue: bool = True,
522
+ element_weights: Optional[Sequence[float]] = None,
523
+ n_workers: int = None,
524
+ chunk_size: int = None, # elements per chunk; auto-determined when None
525
+ store_location: str = None,
526
+ force_recompute_indices: bool = False,
414
527
  ) -> None:
415
528
  """
529
+ Initialize the sampler with parallel processing for large datasets.
416
530
 
417
- :param label_weights: list(len=num_classes)[float], weights for each class.
418
- :param labels: list(len=dataset_len)[int], labels of a dataset.
419
- :param num_samples: number of samples.
420
- :param restart_num: if we are continuing a previous run, we need to restart the sampler from the same point.
531
+ Args:
532
+ label_weights: Weights for each class (length = number of classes)
533
+ labels: Class label for each dataset element (length = dataset size)
534
+ num_samples: Number of samples to draw
535
+ replacement: Whether to sample with replacement
536
+ element_weights: Optional weights for each element within classes
537
+ n_workers: Number of parallel workers to use (default: number of CPUs-1)
538
+ chunk_size: Size of chunks to process in parallel (default: 10M elements)
421
539
  """
422
-
540
+ print("Initializing optimized parallel weighted sampler...")
423
541
  super(LabelWeightedSampler, self).__init__(None)
424
- # reweight labels from counter otherwsie same weight to labels that have many elements vs a few
425
- label_weights = np.array(label_weights) * np.bincount(labels)
426
-
427
- self.label_weights = torch.as_tensor(label_weights, dtype=torch.float32)
428
- self.labels = torch.as_tensor(labels, dtype=torch.int)
429
- self.element_weights = (
430
- torch.as_tensor(element_weights, dtype=torch.float32)
431
- if element_weights is not None
432
- else None
433
- )
542
+
543
+ # Compute label weights (incorporating class frequencies)
544
+ # Directly use labels as numpy array without conversion
545
+ label_weights = np.asarray(label_weights) * np.bincount(labels)
546
+ self.label_weights = torch.as_tensor(
547
+ label_weights, dtype=torch.float32
548
+ ).share_memory_()
549
+
550
+ # Store element weights if provided
551
+ if element_weights is not None:
552
+ self.element_weights = torch.as_tensor(
553
+ element_weights, dtype=torch.float32
554
+ ).share_memory_()
555
+ else:
556
+ self.element_weights = None
557
+
434
558
  self.replacement = replacement
435
559
  self.num_samples = num_samples
436
- self.restart_num = slurm_restart_count(use_mine=True) + restart_num
437
- self.modify_seed_on_requeue = modify_seed_on_requeue
438
- # list of tensor.
439
- self.klass_indices = [
440
- (self.labels == i_klass).nonzero().squeeze(1)
441
- for i_klass in range(len(label_weights))
442
- ]
443
- self.klass_sizes = [len(klass_indices) for klass_indices in self.klass_indices]
560
+ if (
561
+ store_location is None
562
+ or not os.path.exists(os.path.join(store_location, "klass_indices.pt"))
563
+ or force_recompute_indices
564
+ ):
565
+ # Set number of workers (default: CPU count - 1, at least 1, capped at 20)
566
+ if n_workers is None:
567
+ # Check if running on SLURM
568
+ n_workers = min(20, max(1, mp.cpu_count() - 1))
569
+ if "SLURM_CPUS_PER_TASK" in os.environ:
570
+ n_workers = min(
571
+ 20, max(1, int(os.environ["SLURM_CPUS_PER_TASK"]) - 1)
572
+ )
573
+
574
+ # Try to auto-determine optimal chunk size based on memory
575
+ if chunk_size is None:
576
+ try:
577
+ import psutil
578
+
579
+ # Start from the locally available memory; SLURM limits override it below
580
+ available_memory = psutil.virtual_memory().available
581
+ for name in [
582
+ "SLURM_MEM_PER_NODE",
583
+ "SLURM_MEM_PER_CPU",
584
+ "SLURM_MEM_PER_GPU",
585
+ "SLURM_MEM_PER_TASK",
586
+ ]:
587
+ if name in os.environ:
588
+ available_memory = (
589
+ int(os.environ[name]) * 1024 * 1024
590
+ ) # Convert MB to bytes
591
+ break
592
+
593
+ # Use at most 50% of available memory across all workers
594
+ memory_per_worker = 0.5 * available_memory / n_workers
595
+ # Rough estimate: each label takes 4 bytes, each index 8 bytes
596
+ bytes_per_element = 12
597
+ chunk_size = min(
598
+ max(100_000, int(memory_per_worker / bytes_per_element / 3)),
599
+ 2_000_000,
600
+ )
601
+ print(f"Auto-determined chunk size: {chunk_size:,} elements")
602
+ except (ImportError, KeyError):
603
+ chunk_size = 2_000_000
604
+ print(f"Using default chunk size: {chunk_size:,} elements")
605
+
606
+ # Parallelize the class indices building
607
+ print(f"Building class indices in parallel with {n_workers} workers...")
608
+ klass_indices = self._build_class_indices_parallel(
609
+ labels, chunk_size, n_workers
610
+ )
611
+
612
+ # Convert klass_indices to a single tensor and offset vector
613
+ all_indices = []
614
+ offsets = []
615
+ current_offset = 0
616
+
617
+ # Sort keys to ensure consistent ordering
618
+ keys = klass_indices.keys()
619
+
620
+ # Build concatenated tensor and track offsets
621
+ for i in range(max(keys) + 1):
622
+ offsets.append(current_offset)
623
+ if i in keys:
624
+ indices = klass_indices[i]
625
+ all_indices.append(indices)
626
+ current_offset += len(indices)
627
+
628
+ # Convert to tensors
629
+ self.klass_indices = torch.cat(all_indices).to(torch.int32).share_memory_()
630
+ self.klass_offsets = torch.tensor(offsets, dtype=torch.long).share_memory_()
631
+ if store_location is not None:
632
+ store_path = os.path.join(store_location, "klass_indices.pt")
633
+ if os.path.exists(store_path) and not force_recompute_indices:
634
+ self.klass_indices = torch.load(store_path).share_memory_()
635
+ self.klass_offsets = torch.load(
636
+ store_path.replace(".pt", "_offsets.pt")
637
+ ).share_memory_()
638
+ print(f"Loaded sampler indices from {store_path}")
639
+ else:
640
+ torch.save(self.klass_indices, store_path)
641
+ torch.save(self.klass_offsets, store_path.replace(".pt", "_offsets.pt"))
642
+ print(f"Saved sampler indices to {store_path}")
643
+ print(f"Done initializing sampler with {len(self.klass_offsets)} classes")
444
644
 
445
645
  def __iter__(self):
646
+ # Sample classes according to their weights
647
+ print("sampling a new batch of size", self.num_samples)
648
+
446
649
  sample_labels = torch.multinomial(
447
650
  self.label_weights,
448
651
  num_samples=self.num_samples,
449
652
  replacement=True,
450
- generator=None
451
- if self.restart_num == 0 and not self.modify_seed_on_requeue
452
- else torch.Generator().manual_seed(self.restart_num),
453
653
  )
454
- sample_indices = torch.empty_like(sample_labels)
455
- for i_klass, klass_index in enumerate(self.klass_indices):
654
+ # Get counts of each class in sample_labels
655
+ unique_samples, sample_counts = torch.unique(sample_labels, return_counts=True)
656
+
657
+ # Collect the sampled indices of each class, concatenated after the loop
658
+ result_indices_list = []
659
+
660
+ # Process only the classes that were actually sampled
661
+ for i, (label, count) in tqdm(
662
+ enumerate(zip(unique_samples.tolist(), sample_counts.tolist())),
663
+ total=len(unique_samples),
664
+ desc="Processing classes in sampler",
665
+ ):
666
+ klass_index = self.klass_indices[
667
+ self.klass_offsets[label] : self.klass_offsets[label + 1]
668
+ ]
669
+
456
670
  if klass_index.numel() == 0:
457
671
  continue
458
- left_inds = (sample_labels == i_klass).nonzero().squeeze(1)
459
- if len(left_inds) == 0:
460
- continue
672
+
673
+ # Sample elements from this class
461
674
  if self.element_weights is not None:
462
- right_inds = torch.multinomial(
463
- self.element_weights[klass_index],
464
- num_samples=len(klass_index)
465
- if not self.replacement and len(klass_index) < len(left_inds)
466
- else len(left_inds),
467
- replacement=self.replacement,
468
- generator=None
469
- if self.restart_num == 0 and not self.modify_seed_on_requeue
470
- else torch.Generator().manual_seed(self.restart_num),
471
- )
675
+ # Slicing the element weights for this class is the main memory cost here
676
+ current_element_weights_slice = self.element_weights[klass_index]
677
+
678
+ if self.replacement:
679
+ right_inds = torch.multinomial(
680
+ current_element_weights_slice,
681
+ num_samples=count,
682
+ replacement=True,
683
+ )
684
+ else:
685
+ num_to_sample = min(count, len(klass_index))
686
+ right_inds = torch.multinomial(
687
+ current_element_weights_slice,
688
+ num_samples=num_to_sample,
689
+ replacement=False,
690
+ )
472
691
  elif self.replacement:
473
- right_inds = torch.randint(
474
- len(klass_index),
475
- size=(len(left_inds),),
476
- generator=None
477
- if self.restart_num == 0 and not self.modify_seed_on_requeue
478
- else torch.Generator().manual_seed(self.restart_num),
479
- )
692
+ right_inds = torch.randint(len(klass_index), size=(count,))
480
693
  else:
481
- maxelem = (
482
- len(left_inds)
483
- if len(left_inds) < len(klass_index)
484
- else len(klass_index)
485
- )
486
- right_inds = torch.randperm(len(klass_index))[:maxelem]
487
- sample_indices[left_inds[: len(right_inds)]] = klass_index[right_inds]
488
- if len(right_inds) < len(left_inds):
489
- sample_indices[left_inds[len(right_inds) :]] = -1
490
- # drop all -1
491
- sample_indices = sample_indices[sample_indices != -1]
492
- # torch shuffle
493
- sample_indices = sample_indices[torch.randperm(len(sample_indices))]
494
- self.num_samples = len(sample_indices)
495
- # raise Exception("stop")
496
- yield from iter(sample_indices.tolist())
694
+ num_to_sample = min(count, len(klass_index))
695
+ right_inds = torch.randperm(len(klass_index))[:num_to_sample]
696
+
697
+ # Get actual indices
698
+ sampled_indices = klass_index[right_inds]
699
+ result_indices_list.append(sampled_indices)
700
+
701
+ # Combine all indices
702
+ if result_indices_list: # Check if the list is not empty
703
+ final_result_indices = torch.cat(
704
+ result_indices_list
705
+ ) # concatenate the per-class index tensors
706
+
707
+ # Shuffle the combined indices
708
+ shuffled_indices = final_result_indices[
709
+ torch.randperm(len(final_result_indices))
710
+ ]
711
+ self.num_samples = len(shuffled_indices)
712
+ yield from shuffled_indices.tolist()
713
+ else:
714
+ self.num_samples = 0
715
+ yield from iter([])
497
716
 
498
717
  def __len__(self):
499
718
  return self.num_samples
719
+
720
+ def _merge_chunk_results(self, results_list):
721
+ """Merge results from multiple chunks into a single dictionary.
722
+
723
+ Args:
724
+ results_list: list of dictionaries mapping class labels to index arrays
725
+
726
+ Returns:
727
+ merged dictionary with PyTorch tensors
728
+ """
729
+ merged = {}
730
+
731
+ # Collect all labels across all chunks
732
+ all_labels = set()
733
+ for chunk_result in results_list:
734
+ all_labels.update(chunk_result.keys())
735
+
736
+ # For each unique label
737
+ for label in all_labels:
738
+ # Collect indices from all chunks where this label appears
739
+ indices_lists = [
740
+ chunk_result[label]
741
+ for chunk_result in results_list
742
+ if label in chunk_result
743
+ ]
744
+
745
+ if indices_lists:
746
+ # Concatenate all indices for this label
747
+ merged[label] = torch.tensor(
748
+ np.concatenate(indices_lists), dtype=torch.long
749
+ )
750
+ else:
751
+ merged[label] = torch.tensor([], dtype=torch.long)
752
+
753
+ return merged
754
+
755
+ def _build_class_indices_parallel(self, labels, chunk_size, n_workers=None):
756
+ """Build class indices in parallel across multiple workers.
757
+
758
+ Args:
759
+ labels: array of class labels
760
+ n_workers: number of parallel workers
761
+ chunk_size: size of chunks to process
762
+
763
+ Returns:
764
+ dictionary mapping class labels to tensors of indices
765
+ """
766
+ n = len(labels)
767
+ results = []
768
+ # Create chunks of the labels array with proper sizing
769
+ n_chunks = (n + chunk_size - 1) // chunk_size # Ceiling division
770
+ print(f"Processing {n:,} elements in {n_chunks} chunks...")
771
+
772
+ # Process in chunks to limit memory usage
773
+ with ProcessPoolExecutor(
774
+ max_workers=n_workers, mp_context=mp.get_context("spawn")
775
+ ) as executor:
776
+ # Submit chunks for processing
777
+ futures = []
778
+ for i in range(n_chunks):
779
+ start_idx = i * chunk_size
780
+ end_idx = min((i + 1) * chunk_size, n)
781
+ # Each task receives the chunk boundaries together with the labels array;
782
+ # with the "spawn" context the array is serialized once per submitted chunk
783
+ futures.append(
784
+ executor.submit(
785
+ self._process_chunk_with_slice,
786
+ (start_idx, end_idx, labels),
787
+ )
788
+ )
789
+
790
+ # Collect results as they complete with progress reporting
791
+ for future in tqdm(
792
+ as_completed(futures), total=len(futures), desc="Processing chunks"
793
+ ):
794
+ results.append(future.result())
795
+
796
+ # Merge results from all chunks
797
+ print("Merging results from all chunks...")
798
+ merged_results = self._merge_chunk_results(results)
799
+
800
+ return merged_results
801
+
802
+ def _process_chunk_with_slice(self, slice_info):
803
+ """Process a slice of the labels array by indices.
804
+
805
+ Args:
806
+ slice_info: tuple of (start_idx, end_idx, labels_array) where
807
+ start_idx and end_idx define the slice to process
808
+
809
+ Returns:
810
+ dict mapping class labels to arrays of indices
811
+ """
812
+ start_idx, end_idx, labels_array = slice_info
813
+
814
+ # We're processing a slice of the original array
815
+ labels_slice = labels_array[start_idx:end_idx]
816
+ chunk_indices = {}
817
+
818
+ # Create a direct map of indices
819
+ indices = np.arange(start_idx, end_idx)
820
+
821
+ # Get unique labels in this slice for more efficient processing
822
+ unique_labels = np.unique(labels_slice)
823
+ # For each valid label, find its indices
824
+ for label in unique_labels:
825
+ # Find positions where this label appears (using direct boolean indexing)
826
+ label_mask = labels_slice == label
827
+ chunk_indices[int(label)] = indices[label_mask]
828
+
829
+ return chunk_indices
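
The sampler stores every class's indices in one concatenated int32 tensor plus an offsets vector, so the indices of class c are the slice klass_indices[klass_offsets[c]:klass_offsets[c + 1]] used in __iter__ above. A small self-contained illustration of that layout (toy data, independent of the package):

    import torch

    labels = torch.tensor([2, 0, 1, 0, 2, 2])                    # toy labels for 3 classes
    per_class = {c: (labels == c).nonzero().squeeze(1) for c in range(3)}

    flat, offsets, cur = [], [], 0
    for c in range(3):                                           # ascending label ids, as in __init__
        offsets.append(cur)
        flat.append(per_class[c])
        cur += len(per_class[c])

    klass_indices = torch.cat(flat)                              # tensor([1, 3, 2, 0, 4, 5])
    klass_offsets = torch.tensor(offsets)                        # tensor([0, 2, 3])

    c = 1                                                        # class 1 lives in the slice [2, 3)
    print(klass_indices[klass_offsets[c]:klass_offsets[c + 1]])  # tensor([2])

    # The last class has no end offset; in the package that slot is the zero-weight
    # filler class created in setup(), so __iter__ never asks for klass_offsets[last + 1].
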