scdataloader 1.6.4__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scdataloader/VERSION +1 -1
- scdataloader/__init__.py +2 -0
- scdataloader/__main__.py +98 -36
- scdataloader/collator.py +13 -7
- scdataloader/config.py +99 -0
- scdataloader/data.py +48 -35
- scdataloader/datamodule.py +138 -44
- scdataloader/mapped.py +656 -0
- scdataloader/preprocess.py +239 -91
- scdataloader/utils.py +71 -27
- {scdataloader-1.6.4.dist-info → scdataloader-1.8.0.dist-info}/METADATA +10 -8
- scdataloader-1.8.0.dist-info/RECORD +16 -0
- {scdataloader-1.6.4.dist-info → scdataloader-1.8.0.dist-info}/WHEEL +1 -1
- scdataloader-1.8.0.dist-info/entry_points.txt +2 -0
- scdataloader-1.6.4.dist-info/RECORD +0 -14
- {scdataloader-1.6.4.dist-info → scdataloader-1.8.0.dist-info}/licenses/LICENSE +0 -0
scdataloader/datamodule.py
CHANGED
@@ -1,3 +1,4 @@
+import os
 from typing import Optional, Sequence, Union
 
 import lamindb as ln
@@ -15,7 +16,9 @@ from torch.utils.data.sampler import (
 
 from .collator import Collator
 from .data import Dataset
-from .utils import getBiomartTable
+from .utils import getBiomartTable, slurm_restart_count
+
+FILE_DIR = os.path.dirname(os.path.abspath(__file__))
 
 
 class DataModule(L.LightningDataModule):
@@ -32,22 +35,24 @@ class DataModule(L.LightningDataModule):
         use_default_col: bool = True,
         gene_position_tolerance: int = 10_000,
         # this is for the mappedCollection
-
-        all_clss: list = ["organism_ontology_term_id"],
+        clss_to_predict: list = ["organism_ontology_term_id"],
         hierarchical_clss: list = [],
         # this is for the collator
         how: str = "random expr",
         organism_name: str = "organism_ontology_term_id",
         max_len: int = 1000,
         add_zero_genes: int = 100,
+        replacement: bool = True,
         do_gene_pos: Union[bool, str] = True,
         tp_name: Optional[str] = None,  # "heat_diff"
         assays_to_drop: list = [
-            "EFO:0008853",
-            "EFO:0010961",
-            "EFO:0030007",
-            "EFO:0030062",
+            # "EFO:0008853", #patch seq
+            # "EFO:0010961", # visium
+            "EFO:0030007",  # ATACseq
+            # "EFO:0030062", # slide-seq
         ],
+        metacell_mode: float = 0.0,
+        modify_seed_on_requeue: bool = True,
         **kwargs,
     ):
         """
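Taken together, this hunk renames `all_clss` to `clss_to_predict` and adds `replacement`, `metacell_mode`, and `modify_seed_on_requeue` knobs. A minimal sketch of how the 1.8.0 constructor could be called, assuming a lamindb collection named "my-collection" exists (the collection name and `batch_size` value are illustrative; per the docstring, extra kwargs are forwarded to the torch DataLoader):

from scdataloader import DataModule

# Sketch only: "my-collection" is a placeholder collection name.
dm = DataModule(
    collection_name="my-collection",
    clss_to_predict=["organism_ontology_term_id"],  # renamed from all_clss
    replacement=True,                 # new in 1.8.0: sample with replacement
    metacell_mode=0.2,                # new: probability of metacell mode
    modify_seed_on_requeue=True,      # new: reseed the sampler after a requeue
    batch_size=64,                    # forwarded to torch.utils.data.DataLoader
)
dm.setup()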
@@ -59,7 +64,6 @@
 
         Args:
             collection_name (str): The lamindb collection to be used.
-            clss_to_weight (list, optional): The classes to weight in the trainer's weighted random sampler. Defaults to ["organism_ontology_term_id"].
             organisms (list, optional): The organisms to include in the dataset. Defaults to ["NCBITaxon:9606"].
             weight_scaler (int, optional): how much more you will see the most present vs less present category.
             train_oversampling_per_epoch (float, optional): The proportion of the dataset to include in the training set for each epoch. Defaults to 0.1.
@@ -81,23 +85,24 @@
             organism_name (str, optional): The name of the organism. Defaults to "organism_ontology_term_id".
             tp_name (Optional[str], optional): The name of the timepoint. Defaults to None.
             hierarchical_clss (list, optional): List of hierarchical classes. Defaults to [].
-
-
+            metacell_mode (float, optional): The probability of using metacell mode. Defaults to 0.0.
+            clss_to_predict (list, optional): List of classes to predict. Defaults to ["organism_ontology_term_id"].
+            modify_seed_on_requeue (bool, optional): Whether to modify the seed on requeue. Defaults to True.
             **kwargs: Additional keyword arguments passed to the pytorch DataLoader.
-
         see @file data.py and @file collator.py for more details about some of the parameters
         """
         if collection_name is not None:
             mdataset = Dataset(
                 ln.Collection.filter(name=collection_name).first(),
                 organisms=organisms,
-
-                clss_to_pred=clss_to_pred,
+                clss_to_predict=clss_to_predict,
                 hierarchical_clss=hierarchical_clss,
+                metacell_mode=metacell_mode,
             )
-            # print(mdataset)
             # and location
+        self.metacell_mode = bool(metacell_mode)
         self.gene_pos = None
+        self.collection_name = collection_name
         if do_gene_pos:
             if type(do_gene_pos) is str:
                 print("seeing a string: loading gene positions as biomart parquet file")
@@ -127,7 +132,7 @@
                 c.append(i)
                 prev_position = r["start_position"]
                 prev_chromosome = r["chromosome_name"]
-            print(f"reduced the size to {len(set(c))/len(biomart)}")
+            print(f"reduced the size to {len(set(c)) / len(biomart)}")
             biomart["pos"] = c
             mdataset.genedf = mdataset.genedf.join(biomart, how="inner")
             self.gene_pos = mdataset.genedf["pos"].astype(int).tolist()
@@ -151,11 +156,13 @@
                 org_to_id=mdataset.encoder[organism_name],
                 tp_name=tp_name,
                 organism_name=organism_name,
-                class_names=
+                class_names=clss_to_predict,
+                metacell_mode=bool(metacell_mode),
             )
         self.validation_split = validation_split
         self.test_split = test_split
         self.dataset = mdataset
+        self.replacement = replacement
         self.kwargs = kwargs
         if "sampler" in self.kwargs:
             self.kwargs.pop("sampler")
@@ -166,6 +173,9 @@
         self.clss_to_weight = clss_to_weight
         self.train_weights = None
         self.train_labels = None
+        self.modify_seed_on_requeue = modify_seed_on_requeue
+        self.nnz = None
+        self.restart_num = 0
         self.test_datasets = []
         self.test_idx = []
         super().__init__()
@@ -184,12 +194,8 @@
             f"\ttest datasets={str(self.test_datasets)},\n"
             f"perc test: {str(len(self.test_idx) / self.n_samples)},\n"
             f"\tclss_to_weight={self.clss_to_weight}\n"
-            + (
-                "\twith train_dataset size of=("
-                + str((self.train_weights != 0).sum())
-                + ")\n)"
-            )
-            if self.train_weights is not None
+            + ("\twith train_dataset size of=(" + str(len(self.idx_full)) + ")\n)")
+            if self.idx_full is not None
             else ")"
         )
 
@@ -247,13 +253,23 @@
             stage (str, optional): The stage of the model training process.
                 It can be either 'fit' or 'test'. Defaults to None.
         """
+        SCALE = 10
+        if "nnz" in self.clss_to_weight and self.weight_scaler > 0:
+            self.nnz = self.dataset.mapped_dataset.get_merged_labels("nnz")
+            self.clss_to_weight.remove("nnz")
+            (
+                (self.nnz.max() / SCALE)
+                / ((1 + self.nnz - self.nnz.min()) + (self.nnz.max() / SCALE))
+            ).min()
         if len(self.clss_to_weight) > 0 and self.weight_scaler > 0:
             weights, labels = self.dataset.get_label_weights(
-                self.clss_to_weight,
+                self.clss_to_weight,
+                scaler=self.weight_scaler,
+                return_categories=True,
             )
         else:
             weights = np.ones(1)
-            labels = np.zeros(self.n_samples)
+            labels = np.zeros(self.n_samples, dtype=int)
         if isinstance(self.validation_split, int):
             len_valid = self.validation_split
         else:
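The new `nnz` branch builds a decaying weight curve from per-cell nonzero counts. A standalone numeric illustration of the expression (not package code; example values only). Note that in the hunk the `.min()` result is not bound to a name; the raw counts stored in `self.nnz` are what later feed the sampler's `element_weights`:

import numpy as np

SCALE = 10
nnz = np.array([100, 1_000, 10_000])  # example per-cell nonzero gene counts
w = (nnz.max() / SCALE) / ((1 + nnz - nnz.min()) + (nnz.max() / SCALE))
print(w.round(3))  # [0.999 0.526 0.092] -> weight shrinks as nnz grows
print(w.min())     # ~0.092, the quantity the hunk evaluates with .min()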
@@ -268,12 +284,11 @@
 
         idx_full = []
         if len(self.assays_to_drop) > 0:
-
-            self.dataset.mapped_dataset.get_merged_labels("assay_ontology_term_id")
-
-
-
-            idx_full = np.array(idx_full)
+            badloc = np.isin(
+                self.dataset.mapped_dataset.get_merged_labels("assay_ontology_term_id"),
+                self.assays_to_drop,
+            )
+            idx_full = np.arange(len(labels))[~badloc]
         else:
             idx_full = np.arange(self.n_samples)
         if len_test > 0:
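The rewrite appears to replace per-element filtering with a single vectorized mask. A small self-contained example of the same `np.isin` pattern (assay values other than the retained default drop are illustrative):

import numpy as np

assays = np.array(["EFO:0030007", "EFO:0008913", "EFO:0030007", "EFO:0009922"])
badloc = np.isin(assays, ["EFO:0030007"])  # ATACseq, the remaining default drop
idx_full = np.arange(len(assays))[~badloc]
print(idx_full)  # [1 3] -> positions whose assay is kept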
@@ -302,12 +317,15 @@
         np.random.shuffle(idx_full)
         if len_valid > 0:
             self.valid_idx = idx_full[:len_valid].copy()
+            # store it for later
             idx_full = idx_full[len_valid:]
         else:
             self.valid_idx = None
         weights = np.concatenate([weights, np.zeros(1)])
         labels[~np.isin(np.arange(self.n_samples), idx_full)] = len(weights) - 1
-
+        # some labels will now not exist anymore as replaced by len(weights) - 1.
+        # this means that the associated weights should be 0.
+        # by doing np.bincount(labels)*weights this will be taken into account
         self.train_weights = weights
         self.train_labels = labels
         self.idx_full = idx_full
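The new comment block is worth unpacking: samples routed to validation/test are re-labelled to an extra class whose weight is zero, so `np.bincount(labels) * weights` silently excludes them. A toy check (standalone, not package code):

import numpy as np

weights = np.concatenate([np.array([1.0, 2.0]), np.zeros(1)])  # extra zero-weight class
labels = np.array([0, 1, 1, 0, 0])
labels[[1, 4]] = len(weights) - 1     # pretend these two are held out
print(np.bincount(labels) * weights)  # [2. 2. 0.] -> held-out class contributes 0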
@@ -319,17 +337,31 @@
         # int(self.n_samples*self.train_oversampling_per_epoch),
         # replacement=True,
         # )
-
-
-
-
+        try:
+            train_sampler = LabelWeightedSampler(
+                label_weights=self.train_weights,
+                labels=self.train_labels,
+                num_samples=int(self.n_samples * self.train_oversampling_per_epoch),
+                element_weights=self.nnz,
+                replacement=self.replacement,
+                restart_num=self.restart_num,
+                modify_seed_on_requeue=self.modify_seed_on_requeue,
+            )
+        except ValueError as e:
+            raise ValueError(e + "have you run `datamodule.setup()`?")
+        return DataLoader(
+            self.dataset,
+            sampler=train_sampler,
+            **self.kwargs,
+            **kwargs,
         )
-        return DataLoader(self.dataset, sampler=train_sampler, **self.kwargs, **kwargs)
 
     def val_dataloader(self):
         return (
             DataLoader(
-                self.dataset,
+                self.dataset,
+                sampler=SubsetRandomSampler(self.valid_idx),
+                **self.kwargs,
             )
             if self.valid_idx is not None
             else None
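For intuition, `LabelWeightedSampler` with uniform class weights draws classes evenly regardless of how imbalanced they are in the dataset. A hedged standalone check against the 1.8.0 signature shown below (assumes `slurm_restart_count` returns 0 outside a SLURM job):

import torch
from scdataloader.datamodule import LabelWeightedSampler

labels = [0] * 90 + [1] * 10      # class 1 is 9x rarer than class 0
sampler = LabelWeightedSampler(
    label_weights=[1.0, 1.0],     # equal weights -> class-balanced draws
    labels=labels,
    num_samples=1_000,
)
drawn = torch.as_tensor(list(sampler))
print((torch.as_tensor(labels)[drawn] == 1).float().mean())  # ~0.5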
@@ -346,7 +378,9 @@
 
     def predict_dataloader(self):
         return DataLoader(
-            self.dataset,
+            self.dataset,
+            sampler=SubsetRandomSampler(self.idx_full),
+            **self.kwargs,
         )
 
     # def teardown(self):
@@ -359,18 +393,29 @@ class LabelWeightedSampler(Sampler[int]):
     label_weights: Sequence[float]
     klass_indices: Sequence[Sequence[int]]
     num_samples: int
-
+    nnz: Optional[Sequence[int]]
+    replacement: bool
+    restart_num: int
+    modify_seed_on_requeue: bool
     # when we use, just set weights for each classes(here is: np.ones(num_classes)), and labels of a dataset.
     # this will result a class-balanced sampling, no matter how imbalance the labels are.
-
+
     def __init__(
-        self,
+        self,
+        label_weights: Sequence[float],
+        labels: Sequence[int],
+        num_samples: int,
+        replacement: bool = True,
+        element_weights: Sequence[float] = None,
+        restart_num: int = 0,
+        modify_seed_on_requeue: bool = True,
     ) -> None:
         """
 
         :param label_weights: list(len=num_classes)[float], weights for each class.
         :param labels: list(len=dataset_len)[int], labels of a dataset.
         :param num_samples: number of samples.
+        :param restart_num: if we are continuing a previous run, we need to restart the sampler from the same point.
         """
 
         super(LabelWeightedSampler, self).__init__(None)
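The `restart_num` plumbing added here is what lets a requeued job sample a new but reproducible order, as wired up in `__iter__` below. A sketch of the idea; the assumption that `slurm_restart_count(use_mine=True)` reduces to reading `SLURM_RESTART_COUNT` is mine and not confirmed by this diff:

import os
import torch

# Assumption: slurm_restart_count(use_mine=True) behaves roughly like this.
restart_num = int(os.environ.get("SLURM_RESTART_COUNT", "0"))
generator = (
    None if restart_num == 0 else torch.Generator().manual_seed(restart_num)
)
# Same multinomial call shape as in __iter__: a fresh, seeded draw per restart.
draws = torch.multinomial(torch.ones(5), num_samples=3, replacement=True, generator=generator)
print(draws)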
@@ -379,24 +424,73 @@ class LabelWeightedSampler(Sampler[int]):
 
         self.label_weights = torch.as_tensor(label_weights, dtype=torch.float32)
         self.labels = torch.as_tensor(labels, dtype=torch.int)
+        self.element_weights = (
+            torch.as_tensor(element_weights, dtype=torch.float32)
+            if element_weights is not None
+            else None
+        )
+        self.replacement = replacement
         self.num_samples = num_samples
+        self.restart_num = slurm_restart_count(use_mine=True) + restart_num
+        self.modify_seed_on_requeue = modify_seed_on_requeue
         # list of tensor.
         self.klass_indices = [
             (self.labels == i_klass).nonzero().squeeze(1)
             for i_klass in range(len(label_weights))
         ]
+        self.klass_sizes = [len(klass_indices) for klass_indices in self.klass_indices]
 
     def __iter__(self):
         sample_labels = torch.multinomial(
-            self.label_weights,
+            self.label_weights,
+            num_samples=self.num_samples,
+            replacement=True,
+            generator=None
+            if self.restart_num == 0 and not self.modify_seed_on_requeue
+            else torch.Generator().manual_seed(self.restart_num),
         )
         sample_indices = torch.empty_like(sample_labels)
         for i_klass, klass_index in enumerate(self.klass_indices):
             if klass_index.numel() == 0:
                 continue
             left_inds = (sample_labels == i_klass).nonzero().squeeze(1)
-
-
+            if len(left_inds) == 0:
+                continue
+            if self.element_weights is not None:
+                right_inds = torch.multinomial(
+                    self.element_weights[klass_index],
+                    num_samples=len(klass_index)
+                    if not self.replacement and len(klass_index) < len(left_inds)
+                    else len(left_inds),
+                    replacement=self.replacement,
+                    generator=None
+                    if self.restart_num == 0 and not self.modify_seed_on_requeue
+                    else torch.Generator().manual_seed(self.restart_num),
+                )
+            elif self.replacement:
+                right_inds = torch.randint(
+                    len(klass_index),
+                    size=(len(left_inds),),
+                    generator=None
+                    if self.restart_num == 0 and not self.modify_seed_on_requeue
+                    else torch.Generator().manual_seed(self.restart_num),
+                )
+            else:
+                maxelem = (
+                    len(left_inds)
+                    if len(left_inds) < len(klass_index)
+                    else len(klass_index)
+                )
+                right_inds = torch.randperm(len(klass_index))[:maxelem]
+            sample_indices[left_inds[: len(right_inds)]] = klass_index[right_inds]
+            if len(right_inds) < len(left_inds):
+                sample_indices[left_inds[len(right_inds) :]] = -1
+        # drop all -1
+        sample_indices = sample_indices[sample_indices != -1]
+        # torch shuffle
+        sample_indices = sample_indices[torch.randperm(len(sample_indices))]
+        self.num_samples = len(sample_indices)
+        # raise Exception("stop")
         yield from iter(sample_indices.tolist())
 
     def __len__(self):