scdataloader 0.0.4__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- scdataloader/VERSION +1 -1
- scdataloader/__main__.py +3 -0
- scdataloader/collator.py +56 -31
- scdataloader/config.py +6 -0
- scdataloader/data.py +98 -87
- scdataloader/datamodule.py +66 -38
- scdataloader/mapped.py +266 -105
- scdataloader/preprocess.py +3 -207
- scdataloader/utils.py +57 -8
- {scdataloader-0.0.4.dist-info → scdataloader-1.0.1.dist-info}/METADATA +45 -20
- scdataloader-1.0.1.dist-info/RECORD +16 -0
- scdataloader-0.0.4.dist-info/RECORD +0 -16
- {scdataloader-0.0.4.dist-info → scdataloader-1.0.1.dist-info}/LICENSE +0 -0
- {scdataloader-0.0.4.dist-info → scdataloader-1.0.1.dist-info}/WHEEL +0 -0
- {scdataloader-0.0.4.dist-info → scdataloader-1.0.1.dist-info}/entry_points.txt +0 -0
scdataloader/datamodule.py
CHANGED
```diff
@@ -6,6 +6,7 @@ from torch.utils.data.sampler import (
     WeightedRandomSampler,
     SubsetRandomSampler,
     SequentialSampler,
+    RandomSampler,
 )
 import torch
 from torch.utils.data import DataLoader, Sampler
```
```diff
@@ -22,7 +23,7 @@ class DataModule(L.LightningDataModule):
     def __init__(
         self,
         collection_name: str,
-
+        clss_to_weight: list = ["organism_ontology_term_id"],
         organisms: list = ["NCBITaxon:9606"],
         weight_scaler: int = 10,
         train_oversampling_per_epoch: float = 0.1,
```
```diff
@@ -32,9 +33,9 @@ class DataModule(L.LightningDataModule):
         use_default_col: bool = True,
         gene_position_tolerance: int = 10_000,
         # this is for the mappedCollection
-
-
-
+        clss_to_pred: list = ["organism_ontology_term_id"],
+        all_clss: list = ["organism_ontology_term_id"],
+        hierarchical_clss: list = [],
         # this is for the collator
         how: str = "random expr",
         organism_name: str = "organism_ontology_term_id",
```
```diff
@@ -59,36 +60,55 @@ class DataModule(L.LightningDataModule):
 
         Args:
             collection_name (str): The lamindb collection to be used.
-
-            gene_position_tolerance (int, optional): The tolerance for gene position. Defaults to 10_000.
-                any genes within this distance of each other will be considered at the same position.
-            gene_embeddings (str, optional): The path to the gene embeddings file. Defaults to "".
-                the file must have ensembl_gene_id as index.
-                This is used to subset the available genes further to the ones that have embeddings in your model.
+            clss_to_weight (list, optional): The classes to weight in the trainer's weighted random sampler. Defaults to ["organism_ontology_term_id"].
             organisms (list, optional): The organisms to include in the dataset. Defaults to ["NCBITaxon:9606"].
-
+            weight_scaler (int, optional): how much more you will see the most present vs less present category.
+            train_oversampling_per_epoch (float, optional): The proportion of the dataset to include in the training set for each epoch. Defaults to 0.1.
             validation_split (float, optional): The proportion of the dataset to include in the validation split. Defaults to 0.2.
             test_split (float, optional): The proportion of the dataset to include in the test split. Defaults to 0.
                 it will use a full dataset and will round to the nearest dataset's cell count.
-
+            gene_embeddings (str, optional): The path to the gene embeddings file. Defaults to "".
+                the file must have ensembl_gene_id as index.
+                This is used to subset the available genes further to the ones that have embeddings in your model.
+            use_default_col (bool, optional): Whether to use the default collator. Defaults to True.
+            gene_position_tolerance (int, optional): The tolerance for gene position. Defaults to 10_000.
+                any genes within this distance of each other will be considered at the same position.
+            clss_to_weight (list, optional): List of labels to weight in the trainer's weighted random sampler. Defaults to [].
+            assays_to_drop (list, optional): List of assays to drop from the dataset. Defaults to [].
+            do_gene_pos (Union[bool, str], optional): Whether to use gene positions. Defaults to True.
+            max_len (int, optional): The maximum length of the input tensor. Defaults to 1000.
+            add_zero_genes (int, optional): The number of zero genes to add to the input tensor. Defaults to 100.
+            how (str, optional): The method to use for the collator. Defaults to "random expr".
+            organism_name (str, optional): The name of the organism. Defaults to "organism_ontology_term_id".
+            tp_name (Optional[str], optional): The name of the timepoint. Defaults to None.
+            hierarchical_clss (list, optional): List of hierarchical classes. Defaults to [].
+            all_clss (list, optional): List of all classes. Defaults to ["organism_ontology_term_id"].
+            clss_to_pred (list, optional): List of classes to predict. Defaults to ["organism_ontology_term_id"].
             **kwargs: Additional keyword arguments passed to the pytorch DataLoader.
+
+        see @file data.py and @file collator.py for more details about some of the parameters
         """
         if collection_name is not None:
             mdataset = Dataset(
                 ln.Collection.filter(name=collection_name).first(),
                 organisms=organisms,
-                obs=
-                clss_to_pred=
-                hierarchical_clss=
+                obs=all_clss,
+                clss_to_pred=clss_to_pred,
+                hierarchical_clss=hierarchical_clss,
             )
-            print(mdataset)
+            # print(mdataset)
             # and location
+            self.gene_pos = None
             if do_gene_pos:
                 if type(do_gene_pos) is str:
                     print("seeing a string: loading gene positions as biomart parquet file")
                     biomart = pd.read_parquet(do_gene_pos)
                 else:
                     # and annotations
+                    if organisms != ["NCBITaxon:9606"]:
+                        raise ValueError(
+                            "need to provide your own table as this automated function only works for humans for now"
+                        )
                     biomart = getBiomartTable(
                         attributes=["start_position", "chromosome_name"]
                     ).set_index("ensembl_gene_id")
```
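Taken together with the docstring above, the constructor now takes the weighting and prediction label sets explicitly. A minimal usage sketch of the 1.0.1 signature; the collection name, label columns, and batch size below are illustrative assumptions, not values taken from this diff:

```python
# Hypothetical usage; only the parameter names come from the diff above.
from scdataloader.datamodule import DataModule

datamodule = DataModule(
    collection_name="my-lamindb-collection",
    organisms=["NCBITaxon:9606"],
    clss_to_weight=["cell_type_ontology_term_id"],  # drives the weighted sampler
    clss_to_pred=["cell_type_ontology_term_id"],    # labels the model will predict
    all_clss=["cell_type_ontology_term_id", "organism_ontology_term_id"],
    hierarchical_clss=["cell_type_ontology_term_id"],
    batch_size=64,  # forwarded to torch's DataLoader via **kwargs
)
datamodule.setup()  # computes weights and carves out whole test datasets
```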
```diff
@@ -118,7 +138,7 @@ class DataModule(L.LightningDataModule):
             )
             if do_gene_pos:
                 self.gene_pos = mdataset.genedf["pos"].tolist()
-        self.
+        self.classes = {k: len(v) for k, v in mdataset.class_topred.items()}
         # we might want not to order the genes by expression (or do it?)
         # we might want to not introduce zeros and
         if use_default_col:
```
```diff
@@ -131,19 +151,23 @@ class DataModule(L.LightningDataModule):
                 org_to_id=mdataset.encoder[organism_name],
                 tp_name=tp_name,
                 organism_name=organism_name,
-                class_names=
+                class_names=clss_to_weight,
             )
         self.validation_split = validation_split
         self.test_split = test_split
         self.dataset = mdataset
         self.kwargs = kwargs
+        if "sampler" in self.kwargs:
+            self.kwargs.pop("sampler")
         self.assays_to_drop = assays_to_drop
         self.n_samples = len(mdataset)
         self.weight_scaler = weight_scaler
         self.train_oversampling_per_epoch = train_oversampling_per_epoch
-        self.
+        self.clss_to_weight = clss_to_weight
         self.train_weights = None
         self.train_labels = None
+        self.test_datasets = []
+        self.test_idx = []
         super().__init__()
 
     def __repr__(self):
```
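Popping a user-supplied `sampler` from `kwargs` is defensive: each dataloader method below passes its own `sampler=` argument alongside `**self.kwargs`, so a leftover `sampler` key would make the `DataLoader(...)` call fail with a duplicate-keyword `TypeError`.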
```diff
@@ -154,8 +178,12 @@ class DataModule(L.LightningDataModule):
             f"\ttest_split={self.test_split},\n"
             f"\tn_samples={self.n_samples},\n"
             f"\tweight_scaler={self.weight_scaler},\n"
-            f"\
-            f"\
+            f"\ttrain_oversampling_per_epoch={self.train_oversampling_per_epoch},\n"
+            f"\tassays_to_drop={self.assays_to_drop},\n"
+            f"\tnum_datasets={len(self.dataset.mapped_dataset.storages)},\n"
+            f"\ttest datasets={str(self.test_datasets)},\n"
+            f"perc test: {str(len(self.test_idx) / self.n_samples)},\n"
+            f"\tclss_to_weight={self.clss_to_weight}\n"
             + (
                 "\twith train_dataset size of=("
                 + str((self.train_weights != 0).sum())
```
```diff
@@ -179,22 +207,22 @@ class DataModule(L.LightningDataModule):
         return decoders
 
     @property
-    def
+    def labels_hierarchy(self):
         """
-
+        labels_hierarchy the hierarchy of labels for any cls that would have a hierarchy
 
         Returns:
             dict[str, dict[str, str]]
         """
-
-        for k, dic in self.dataset.
+        labels_hierarchy = {}
+        for k, dic in self.dataset.labels_groupings.items():
             rdic = {}
             for sk, v in dic.items():
                 rdic[self.dataset.encoder[k][sk]] = [
                     self.dataset.encoder[k][i] for i in list(v)
                 ]
-
-        return
+            labels_hierarchy[k] = rdic
+        return labels_hierarchy
 
     @property
     def genes(self):
```
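The renamed `labels_hierarchy` property converts the ontology groupings from term-id space into the integer codes produced by the dataset's encoders. A toy illustration of the same transformation, with invented terms and codes:

```python
# Invented encoder and groupings, mirroring the property's logic above.
encoder = {"cell_type_ontology_term_id": {"CL:A": 0, "CL:B": 1, "CL:C": 2}}
labels_groupings = {"cell_type_ontology_term_id": {"CL:A": ["CL:B", "CL:C"]}}

labels_hierarchy = {}
for k, dic in labels_groupings.items():
    rdic = {}
    for sk, v in dic.items():
        # encoded parent -> list of encoded children
        rdic[encoder[k][sk]] = [encoder[k][i] for i in list(v)]
    labels_hierarchy[k] = rdic

print(labels_hierarchy)  # {'cell_type_ontology_term_id': {0: [1, 2]}}
```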
```diff
@@ -219,9 +247,9 @@ class DataModule(L.LightningDataModule):
             stage (str, optional): The stage of the model training process.
                 It can be either 'fit' or 'test'. Defaults to None.
         """
-        if len(self.
+        if len(self.clss_to_weight) > 0 and self.weight_scaler > 0:
             weights, labels = self.dataset.get_label_weights(
-                self.
+                self.clss_to_weight, scaler=self.weight_scaler
             )
         else:
             weights = np.ones(1)
```
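`get_label_weights` is defined in data.py and is not part of this diff; per the docstring, `weight_scaler` bounds how much more often the most frequent category is drawn than the least frequent one. A plausible sketch of such scaler-capped inverse-frequency weighting (not the library's actual implementation):

```python
import numpy as np

def sketch_label_weights(labels: np.ndarray, scaler: int = 10) -> np.ndarray:
    """Illustrative only; assumes every class id in `labels` occurs at least once."""
    counts = np.bincount(labels)
    weights = 1.0 / counts  # rare classes get larger weights
    # cap the correction: rare classes are drawn at most `scaler` times
    # more often than the most frequent class
    weights = np.minimum(weights, weights.min() * scaler)
    return weights[labels]

per_sample = sketch_label_weights(np.array([0, 0, 0, 0, 1]), scaler=2)
print(per_sample)  # -> [0.25 0.25 0.25 0.25 0.5]
```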
```diff
@@ -248,7 +276,6 @@ class DataModule(L.LightningDataModule):
             idx_full = np.array(idx_full)
         else:
             idx_full = np.arange(self.n_samples)
-        test_datasets = []
         if len_test > 0:
             # this way we work on some never seen datasets
             # keeping at least one
```
```diff
@@ -258,17 +285,15 @@ class DataModule(L.LightningDataModule):
                 else self.dataset.mapped_dataset.n_obs_list[0]
             )
             cs = 0
-            print("these files will be considered test datasets:")
             for i, c in enumerate(self.dataset.mapped_dataset.n_obs_list):
                 if cs + c > len_test:
                     break
                 else:
-
-
+                    self.test_datasets.append(
+                        self.dataset.mapped_dataset._path_list[i].path
+                    )
                     cs += c
-
             len_test = cs
-            print("perc test: ", len_test / self.n_samples)
             self.test_idx = idx_full[:len_test]
             idx_full = idx_full[len_test:]
         else:
```
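Because the test split is rounded down to whole datasets, the realized split can be smaller than requested. A worked example of the greedy accumulation above, with invented cell counts:

```python
# Invented per-dataset cell counts; the real values come from
# self.dataset.mapped_dataset.n_obs_list.
n_obs_list = [500, 300, 800]
len_test = 700  # requested: test_split * n_samples

test_datasets, cs = [], 0
for i, c in enumerate(n_obs_list):
    if cs + c > len_test:
        break
    test_datasets.append(i)  # the real code appends the dataset's path
    cs += c
len_test = cs

print(test_datasets, len_test)  # [0] 500 -- dataset 1 would overshoot 700
```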
```diff
@@ -286,8 +311,7 @@ class DataModule(L.LightningDataModule):
         self.train_weights = weights
         self.train_labels = labels
         self.idx_full = idx_full
-
-        return test_datasets
+        return self.test_datasets
 
     def train_dataloader(self, **kwargs):
         # train_sampler = WeightedRandomSampler(
```
```diff
@@ -299,7 +323,6 @@ class DataModule(L.LightningDataModule):
             self.train_weights,
             self.train_labels,
             num_samples=int(self.n_samples * self.train_oversampling_per_epoch),
-            # replacement=True,
         )
         return DataLoader(self.dataset, sampler=train_sampler, **self.kwargs, **kwargs)
 
```
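The custom sampler class wired in here falls outside this hunk's context lines; judging from the commented-out code it replaced, it plays the role of torch's `WeightedRandomSampler` over per-label weights. A rough, self-contained stand-in (all values invented):

```python
import torch
from torch.utils.data.sampler import WeightedRandomSampler

# Invented stand-in values; in the DataModule these come from setup().
train_weights = torch.tensor([0.1, 1.0])   # per-class weights
train_labels = torch.tensor([0, 0, 0, 1])  # per-sample class ids
n_samples, train_oversampling_per_epoch = 4, 0.5

sampler = WeightedRandomSampler(
    weights=train_weights[train_labels],  # expand to per-sample weights
    num_samples=int(n_samples * train_oversampling_per_epoch),
    replacement=True,
)
print(list(sampler))  # e.g. [3, 1] -- the rare label-1 cell is oversampled
```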
```diff
@@ -321,6 +344,11 @@ class DataModule(L.LightningDataModule):
             else None
         )
 
+    def predict_dataloader(self):
+        return DataLoader(
+            self.dataset, sampler=SubsetRandomSampler(self.idx_full), **self.kwargs
+        )
+
     # def teardown(self):
     #     clean up state after the trainer stops, delete files...
     #     called on every process in DDP
```
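Note that the new `predict_dataloader` draws from `self.idx_full`, which after `setup()` holds the indices that remain once the whole test datasets have been carved out, and it uses `SubsetRandomSampler`, so prediction order is shuffled.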