scdataloader 1.9.0__tar.gz → 1.9.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -135,3 +135,5 @@ figures/*/*.png
 figures/*.png
 figures/add_postp_clust.py
 figures/age_relabel.py
+notebooks/figures/umap_*.png
+notebooks/data/
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scdataloader
-Version: 1.9.0
+Version: 1.9.2
 Summary: a dataloader for single cell data in lamindb
 Project-URL: repository, https://github.com/jkobject/scDataLoader
 Author-email: jkobject <jkobject@gmail.com>
@@ -15,7 +15,7 @@ Requires-Dist: django>=4.0.0
 Requires-Dist: harmonypy>=0.0.10
 Requires-Dist: ipykernel>=6.20.0
 Requires-Dist: jupytext>=1.16.0
-Requires-Dist: lamindb[bionty,cellregistry,jupyter,ourprojects,zarr]<2,>=1.0.4
+Requires-Dist: lamindb[bionty,cellregistry,jupyter,zarr]<2,>=1.0.4
 Requires-Dist: leidenalg>=0.8.0
 Requires-Dist: matplotlib>=3.5.0
 Requires-Dist: numpy==1.26.0
@@ -71,7 +71,16 @@ It allows you to:
 3. create a more complex single cell dataset
 4. extend it to your need

-built on top of `lamindb` and the `.mapped()` function by Sergei: https://github.com/Koncopd
+built on top of `lamindb` and the `.mapped()` function by Sergei: https://github.com/Koncopd
+
+```
+Portions of the mapped.py file are derived from Lamin Labs
+Copyright 2024 Lamin Labs
+Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+The rest of the package is licensed under MIT License, see LICENSE for details
+Please see https://github.com/laminlabs/lamindb/blob/main/lamindb/core/_mapped_collection.py
+for the original implementation
+```

 The package has been designed together with the [scPRINT paper](https://doi.org/10.1101/2024.07.29.605556) and [model](https://github.com/cantinilab/scPRINT).

@@ -28,7 +28,16 @@ It allows you to:
 3. create a more complex single cell dataset
 4. extend it to your need

-built on top of `lamindb` and the `.mapped()` function by Sergei: https://github.com/Koncopd
+built on top of `lamindb` and the `.mapped()` function by Sergei: https://github.com/Koncopd
+
+```
+Portions of the mapped.py file are derived from Lamin Labs
+Copyright 2024 Lamin Labs
+Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+The rest of the package is licensed under MIT License, see LICENSE for details
+Please see https://github.com/laminlabs/lamindb/blob/main/lamindb/core/_mapped_collection.py
+for the original implementation
+```

 The package has been designed together with the [scPRINT paper](https://doi.org/10.1101/2024.07.29.605556) and [model](https://github.com/cantinilab/scPRINT).

@@ -1,6 +1,6 @@
 [project]
 name = "scdataloader"
-version = "1.9.0"
+version = "1.9.2"
 description = "a dataloader for single cell data in lamindb"
 authors = [
     {name = "jkobject", email = "jkobject@gmail.com"}
@@ -11,7 +11,7 @@ requires-python = ">=3.10,<3.14"
 keywords = ["scRNAseq", "dataloader", "pytorch", "lamindb", "scPRINT"]
 dependencies = [
     "numpy==1.26.0",
-    "lamindb[bionty,ourprojects,jupyter,cellregistry,zarr]>=1.0.4,<2",
+    "lamindb[bionty,jupyter,cellregistry,zarr]>=1.0.4,<2",
     "cellxgene-census>=0.1.0",
     "torch==2.2.0",
     "pytorch-lightning>=2.3.0",
@@ -0,0 +1 @@
+1.9.2
@@ -1,7 +1,8 @@
+from importlib.metadata import version
+
 from .collator import Collator
 from .data import Dataset, SimpleAnnDataset
 from .datamodule import DataModule
 from .preprocess import Preprocessor
-from importlib.metadata import version

 __version__ = version("scdataloader")
@@ -148,7 +148,6 @@ class Collator:
                     :, self.accepted_genes[organism_id]
                 ]
             if self.how == "most expr":
-                nnz_loc = np.where(expr > 0)[0]
                 if "knn_cells" in elem:
                     nnz_loc = np.where(expr + elem["knn_cells"].sum(0) > 0)[0]
                 ma = self.max_len if self.max_len < len(nnz_loc) else len(nnz_loc)
@@ -161,14 +160,18 @@ class Collator:
                 # loc = np.argsort(expr)[-(self.max_len) :][::-1]
             elif self.how == "random expr":
                 nnz_loc = np.where(expr > 0)[0]
-                loc = nnz_loc[
-                    np.random.choice(
-                        len(nnz_loc),
-                        self.max_len if self.max_len < len(nnz_loc) else len(nnz_loc),
-                        replace=False,
-                        # p=(expr.max() + (expr[nnz_loc])*19) / expr.max(), # 20 at most times more likely to be selected
-                    )
-                ]
+                loc = (
+                    nnz_loc[
+                        np.random.choice(
+                            len(nnz_loc),
+                            self.max_len,
+                            replace=False,
+                            # p=(expr.max() + (expr[nnz_loc])*19) / expr.max(), # 20 at most times more likely to be selected
+                        )
+                    ]
+                    if self.max_len < len(nnz_loc)
+                    else nnz_loc
+                )
             elif self.how in ["all", "some"]:
                 loc = np.arange(len(expr))
             else:
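The `random expr` rewrite moves the size guard out of `np.random.choice`: when all expressed genes already fit within `max_len`, the new code returns `nnz_loc` directly instead of drawing a full without-replacement permutation of it. A minimal standalone sketch of the new selection logic (the helper name and the toy `expr` vector are illustrative):

```python
import numpy as np

def select_random_expr(expr: np.ndarray, max_len: int) -> np.ndarray:
    # indices of expressed (nonzero) genes
    nnz_loc = np.where(expr > 0)[0]
    if max_len < len(nnz_loc):
        # subsample without replacement only when there are too many
        return nnz_loc[np.random.choice(len(nnz_loc), max_len, replace=False)]
    # everything fits: keep the expressed genes in their original order,
    # skipping the needless full shuffle the 1.9.0 version performed
    return nnz_loc

expr = np.array([0.0, 2.5, 0.0, 1.1, 0.3])
print(select_random_expr(expr, max_len=2))  # two of the indices {1, 3, 4}
print(select_random_expr(expr, max_len=8))  # [1 3 4]
```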
@@ -179,23 +182,19 @@ class Collator:
                 "all",
                 "some",
             ]:
+                ma = self.add_zero_genes + (
+                    0 if self.max_len < len(nnz_loc) else self.max_len - len(nnz_loc)
+                )
                 if "knn_cells" in elem:
                     # we complete with genes expressed in the knn
-                    nnz_loc = np.where(elem["knn_cells"].sum(0) > 0)[0]
-                    ma = self.max_len if self.max_len < len(nnz_loc) else len(nnz_loc)
                     # which is not a zero_loc in this context
-                    zero_loc = np.argsort(elem["knn_cells"].sum(0))[-(ma):][::-1]
+                    zero_loc = np.argsort(elem["knn_cells"].sum(0))[-ma:][::-1]
                 else:
                     zero_loc = np.where(expr == 0)[0]
                     zero_loc = zero_loc[
                         np.random.choice(
                             len(zero_loc),
-                            self.add_zero_genes
-                            + (
-                                0
-                                if self.max_len < len(nnz_loc)
-                                else self.max_len - len(nnz_loc)
-                            ),
+                            ma,
                             replace=False,
                         )
                     ]
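Hoisting `ma` above the branch makes both paths agree on the padding count: `add_zero_genes` extra zero-expression genes, plus however many are needed to top the selection up to `max_len` when fewer than `max_len` genes are expressed. Sketched in isolation (same names as the diff, toy data):

```python
import numpy as np

def pick_zero_genes(expr, nnz_loc, max_len, add_zero_genes):
    # requested zero genes, plus filler when the expressed genes fall short of max_len
    ma = add_zero_genes + (0 if max_len < len(nnz_loc) else max_len - len(nnz_loc))
    zero_loc = np.where(expr == 0)[0]
    return zero_loc[np.random.choice(len(zero_loc), ma, replace=False)]

expr = np.array([0.0, 2.5, 0.0, 1.1, 0.0, 0.0])
nnz_loc = np.where(expr > 0)[0]  # only 2 expressed genes
# 1 requested zero gene + 1 filler to reach max_len=3 -> 2 indices from {0, 2, 4, 5}
print(pick_zero_genes(expr, nnz_loc, max_len=3, add_zero_genes=1))
```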
@@ -16,7 +16,7 @@ from torch.utils.data.sampler import (

 from .collator import Collator
 from .data import Dataset
-from .utils import getBiomartTable, slurm_restart_count
+from .utils import getBiomartTable

 FILE_DIR = os.path.dirname(os.path.abspath(__file__))

@@ -28,7 +28,7 @@ class DataModule(L.LightningDataModule):
         clss_to_weight: list = ["organism_ontology_term_id"],
         organisms: list = ["NCBITaxon:9606"],
         weight_scaler: int = 10,
-        train_oversampling_per_epoch: float = 0.1,
+        n_samples_per_epoch: int = 2_000_000,
         validation_split: float = 0.2,
         test_split: float = 0,
         gene_embeddings: str = "",
@@ -53,7 +53,6 @@ class DataModule(L.LightningDataModule):
         ],
         metacell_mode: float = 0.0,
         get_knn_cells: bool = False,
-        modify_seed_on_requeue: bool = True,
         **kwargs,
     ):
         """
@@ -67,7 +66,7 @@ class DataModule(L.LightningDataModule):
            collection_name (str): The lamindb collection to be used.
            organisms (list, optional): The organisms to include in the dataset. Defaults to ["NCBITaxon:9606"].
            weight_scaler (int, optional): how much more you will see the most present vs less present category.
-           train_oversampling_per_epoch (float, optional): The proportion of the dataset to include in the training set for each epoch. Defaults to 0.1.
+           n_samples_per_epoch (int, optional): The number of samples to include in the training set for each epoch. Defaults to 2_000_000.
            validation_split (float, optional): The proportion of the dataset to include in the validation split. Defaults to 0.2.
            test_split (float, optional): The proportion of the dataset to include in the test split. Defaults to 0.
                it will use a full dataset and will round to the nearest dataset's cell count.
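This is the release's main interface change: the per-epoch training size is now an absolute sample count (`n_samples_per_epoch`) instead of a fraction of the dataset (`train_oversampling_per_epoch`). Migrating an existing configuration is a one-line conversion (the dataset size here is illustrative):

```python
# 1.9.0 style: a fraction of the dataset per epoch
train_oversampling_per_epoch = 0.1
dataset_len = 20_000_000  # illustrative cell count

# 1.9.2 style: an absolute number of samples per epoch
n_samples_per_epoch = int(dataset_len * train_oversampling_per_epoch)
print(n_samples_per_epoch)  # 2_000_000, the new default
```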
@@ -88,7 +87,6 @@ class DataModule(L.LightningDataModule):
            hierarchical_clss (list, optional): List of hierarchical classes. Defaults to [].
            metacell_mode (float, optional): The probability of using metacell mode. Defaults to 0.0.
            clss_to_predict (list, optional): List of classes to predict. Defaults to ["organism_ontology_term_id"].
-           modify_seed_on_requeue (bool, optional): Whether to modify the seed on requeue. Defaults to True.
            get_knn_cells (bool, optional): Whether to get the k-nearest neighbors of each queried cells. Defaults to False.
            **kwargs: Additional keyword arguments passed to the pytorch DataLoader.
            see @file data.py and @file collator.py for more details about some of the parameters
@@ -171,13 +169,11 @@ class DataModule(L.LightningDataModule):
         self.assays_to_drop = assays_to_drop
         self.n_samples = len(mdataset)
         self.weight_scaler = weight_scaler
-        self.train_oversampling_per_epoch = train_oversampling_per_epoch
+        self.n_samples_per_epoch = n_samples_per_epoch
         self.clss_to_weight = clss_to_weight
         self.train_weights = None
         self.train_labels = None
-        self.modify_seed_on_requeue = modify_seed_on_requeue
         self.nnz = None
-        self.restart_num = 0
         self.test_datasets = []
         self.test_idx = []
         super().__init__()
@@ -190,7 +186,7 @@ class DataModule(L.LightningDataModule):
             f"\ttest_split={self.test_split},\n"
             f"\tn_samples={self.n_samples},\n"
             f"\tweight_scaler={self.weight_scaler},\n"
-            f"\ttrain_oversampling_per_epoch={self.train_oversampling_per_epoch},\n"
+            f"\tn_samples_per_epoch={self.n_samples_per_epoch},\n"
             f"\tassays_to_drop={self.assays_to_drop},\n"
             f"\tnum_datasets={len(self.dataset.mapped_dataset.storages)},\n"
             f"\ttest datasets={str(self.test_datasets)},\n"
@@ -336,18 +332,16 @@ class DataModule(L.LightningDataModule):
     def train_dataloader(self, **kwargs):
         # train_sampler = WeightedRandomSampler(
         #     self.train_weights[self.train_labels],
-        #     int(self.n_samples*self.train_oversampling_per_epoch),
+        #     int(self.n_samples*self.n_samples_per_epoch),
         #     replacement=True,
         # )
         try:
             train_sampler = LabelWeightedSampler(
                 label_weights=self.train_weights,
                 labels=self.train_labels,
-                num_samples=int(self.n_samples * self.train_oversampling_per_epoch),
+                num_samples=int(self.n_samples_per_epoch),
                 element_weights=self.nnz,
                 replacement=self.replacement,
-                restart_num=self.restart_num,
-                modify_seed_on_requeue=self.modify_seed_on_requeue,
             )
         except ValueError as e:
             raise ValueError(e + "have you run `datamodule.setup()`?")
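Putting the signature changes together, a hedged construction example (the collection name and batch size are placeholders; `setup()` must run before `train_dataloader()`, as the `ValueError` hint above implies):

```python
from scdataloader import DataModule

datamodule = DataModule(
    collection_name="preprocessed dataset",  # placeholder lamindb collection
    organisms=["NCBITaxon:9606"],
    n_samples_per_epoch=2_000_000,  # replaces train_oversampling_per_epoch=0.1
    validation_split=0.2,
    batch_size=64,
)
datamodule.setup()
train_loader = datamodule.train_dataloader()
```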
@@ -397,8 +391,6 @@ class LabelWeightedSampler(Sampler[int]):
     num_samples: int
     nnz: Optional[Sequence[int]]
     replacement: bool
-    restart_num: int
-    modify_seed_on_requeue: bool
     # when we use, just set weights for each classes(here is: np.ones(num_classes)), and labels of a dataset.
     # this will result a class-balanced sampling, no matter how imbalance the labels are.

@@ -409,15 +401,12 @@ class LabelWeightedSampler(Sampler[int]):
         num_samples: int,
         replacement: bool = True,
         element_weights: Sequence[float] = None,
-        restart_num: int = 0,
-        modify_seed_on_requeue: bool = True,
     ) -> None:
         """

         :param label_weights: list(len=num_classes)[float], weights for each class.
         :param labels: list(len=dataset_len)[int], labels of a dataset.
         :param num_samples: number of samples.
-        :param restart_num: if we are continuing a previous run, we need to restart the sampler from the same point.
         """

         super(LabelWeightedSampler, self).__init__(None)
@@ -433,8 +422,6 @@ class LabelWeightedSampler(Sampler[int]):
         )
         self.replacement = replacement
         self.num_samples = num_samples
-        self.restart_num = slurm_restart_count(use_mine=True) + restart_num
-        self.modify_seed_on_requeue = modify_seed_on_requeue
         # list of tensor.
         self.klass_indices = [
             (self.labels == i_klass).nonzero().squeeze(1)
@@ -447,9 +434,6 @@ class LabelWeightedSampler(Sampler[int]):
             self.label_weights,
             num_samples=self.num_samples,
             replacement=True,
-            generator=None
-            if self.restart_num == 0 and not self.modify_seed_on_requeue
-            else torch.Generator().manual_seed(self.restart_num),
         )
         sample_indices = torch.empty_like(sample_labels)
         for i_klass, klass_index in enumerate(self.klass_indices):
@@ -465,17 +449,11 @@ class LabelWeightedSampler(Sampler[int]):
                     if not self.replacement and len(klass_index) < len(left_inds)
                     else len(left_inds),
                     replacement=self.replacement,
-                    generator=None
-                    if self.restart_num == 0 and not self.modify_seed_on_requeue
-                    else torch.Generator().manual_seed(self.restart_num),
                 )
             elif self.replacement:
                 right_inds = torch.randint(
                     len(klass_index),
                     size=(len(left_inds),),
-                    generator=None
-                    if self.restart_num == 0 and not self.modify_seed_on_requeue
-                    else torch.Generator().manual_seed(self.restart_num),
                 )
             else:
                 maxelem = (
  maxelem = (
@@ -485,6 +463,7 @@ class LabelWeightedSampler(Sampler[int]):
485
463
  )
486
464
  right_inds = torch.randperm(len(klass_index))[:maxelem]
487
465
  sample_indices[left_inds[: len(right_inds)]] = klass_index[right_inds]
466
+ # if there are more left_inds than right_inds, we need to drop the extra ones
488
467
  if len(right_inds) < len(left_inds):
489
468
  sample_indices[left_inds[len(right_inds) :]] = -1
490
469
  # drop all -1
@@ -492,7 +471,6 @@ class LabelWeightedSampler(Sampler[int]):
         # torch shuffle
         sample_indices = sample_indices[torch.randperm(len(sample_indices))]
         self.num_samples = len(sample_indices)
-        # raise Exception("stop")
         yield from iter(sample_indices.tolist())

     def __len__(self):
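With the requeue machinery removed, `LabelWeightedSampler` reduces to a two-stage weighted draw: pick a class per output slot from `label_weights`, then pick a concrete dataset index within that class. A self-contained sketch of that core idea (not the full scdataloader implementation, which also supports `element_weights` and `replacement=False`):

```python
import torch

def class_balanced_indices(labels: torch.Tensor, num_samples: int) -> torch.Tensor:
    num_classes = int(labels.max()) + 1
    label_weights = torch.ones(num_classes)  # uniform over classes
    klass_indices = [(labels == k).nonzero().squeeze(1) for k in range(num_classes)]
    # stage 1: draw a class label for every output slot
    sample_labels = torch.multinomial(label_weights, num_samples, replacement=True)
    sample_indices = torch.empty_like(sample_labels)
    for k, klass_index in enumerate(klass_indices):
        left_inds = (sample_labels == k).nonzero().squeeze(1)
        # stage 2: draw dataset indices within the chosen class
        right_inds = torch.randint(len(klass_index), size=(len(left_inds),))
        sample_indices[left_inds] = klass_index[right_inds]
    return sample_indices[torch.randperm(len(sample_indices))]

labels = torch.tensor([0] * 98 + [1] * 2)  # heavily imbalanced dataset
counts = torch.bincount(labels[class_balanced_indices(labels, 1000)])
print(counts)  # roughly 500/500 despite the 98/2 imbalance
```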
@@ -1,3 +1,10 @@
+# Portions of this file are derived from Lamin Labs
+# Copyright 2024 Lamin Labs
+# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+# The rest of this file is licensed under MIT
+# Please see https://github.com/laminlabs/lamindb/blob/main/lamindb/core/_mapped_collection.py
+# for the original implementation
+
 from __future__ import annotations

 from collections import Counter
@@ -1,3 +1,4 @@
+import gc
 from typing import Callable, Optional, Union
 from uuid import uuid4

@@ -9,7 +10,7 @@ import scanpy as sc
 from anndata import AnnData, read_h5ad
 from scipy.sparse import csr_matrix
 from upath import UPath
-import gc
+
 from scdataloader import utils as data_utils

 FULL_LENGTH_ASSAYS = [
@@ -270,10 +271,12 @@ class Preprocessor:
             var = ens_var

         adata = adata[:, var.index]
-        var = var.sort_values(by="ensembl_gene_id").set_index("ensembl_gene_id")
+        # var = var.sort_values(by="ensembl_gene_id").set_index("ensembl_gene_id")
         # Update adata with combined genes
-        adata.var = var
-        genesdf = genesdf.set_index("ensembl_gene_id")
+        if "ensembl_gene_id" in var.columns:
+            adata.var = var.set_index("ensembl_gene_id")
+        else:
+            adata.var = var
         # Drop duplicate genes, keeping first occurrence
         adata = adata[:, ~adata.var.index.duplicated(keep="first")]
@@ -503,7 +506,7 @@ class LaminPreprocessor(Preprocessor):
                 continue
             print(file)

-            path = cache_path(file) if self.force_preloaded else file.cache()
+            _ = cache_path(file) if self.force_preloaded else file.cache()
             backed = file.open()
             # backed = read_h5ad(path, backed="r")
             if "is_primary_data" in backed.obs.columns:
@@ -1 +0,0 @@
-1.9.0