PyPI - scdataloader - Versions diffs - 0.0.3__py3-none-any.whl → 1.0.1__py3-none-any.whl - Mend

scdataloader 0.0.3__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

scdataloader/VERSION +1 -1
scdataloader/__init__.py +1 -1
scdataloader/__main__.py +66 -42
scdataloader/collator.py +136 -67
scdataloader/config.py +112 -0
scdataloader/data.py +160 -169
scdataloader/datamodule.py +403 -0
scdataloader/mapped.py +285 -109
scdataloader/preprocess.py +240 -109
scdataloader/utils.py +162 -70
{scdataloader-0.0.3.dist-info → scdataloader-1.0.1.dist-info}/METADATA +87 -18
scdataloader-1.0.1.dist-info/RECORD +16 -0
scdataloader/dataloader.py +0 -318
scdataloader-0.0.3.dist-info/RECORD +0 -15
{scdataloader-0.0.3.dist-info → scdataloader-1.0.1.dist-info}/LICENSE +0 -0
{scdataloader-0.0.3.dist-info → scdataloader-1.0.1.dist-info}/WHEEL +0 -0
{scdataloader-0.0.3.dist-info → scdataloader-1.0.1.dist-info}/entry_points.txt +0 -0

scdataloader/VERSION CHANGED

@@ -1 +1 @@
-
+1.0.0

scdataloader/__init__.py CHANGED

@@ -1,4 +1,4 @@
 from .data import Dataset
-from .dataloader import DataModule
+from .datamodule import DataModule
 from .preprocess import Preprocessor
 from .collator import *

scdataloader/__main__.py CHANGED

@@ -1,9 +1,18 @@
 import argparse
-from scdataloader.preprocess import LaminPreprocessor
+from scdataloader.preprocess import (
+    LaminPreprocessor,
+    additional_preprocess,
+    additional_postprocess,
+)
 import lamindb as ln
 from typing import Optional, Union
+# scdataloader --instance="laminlabs/cellxgene" --name="cellxgene-census" --version="2023-12-15" --description="preprocessed for scprint" --new_name="scprint main" --start_at=39
 def main():
+    """
+    main function to preprocess datasets in a given lamindb collection.
+    """
     parser = argparse.ArgumentParser(
         description="Preprocess datasets in a given lamindb collection."
     )
@@ -11,22 +20,31 @@ def main():
         "--name", type=str, required=True, help="Name of the input dataset"
     )
     parser.add_argument(
-        "--new_name", type=str, required=True, help="Name of the preprocessed dataset."
+        "--new_name",
+        type=str,
+        default="preprocessed dataset",
+        help="Name of the preprocessed dataset.",
     )
     parser.add_argument(
         "--description",
         type=str,
-        default="preprocessed by scDataLoader"
+        default="preprocessed by scDataLoader",
         help="Description of the preprocessed dataset.",
     )
     parser.add_argument(
         "--start_at", type=int, default=0, help="Position to start preprocessing at."
     )
     parser.add_argument(
-        "--new_version", type=str, default="2", help="Version of the output dataset and files."
+        "--new_version",
+        type=str,
+        default="2",
+        help="Version of the output dataset and files.",
     )
     parser.add_argument(
-        "--instance", type=str, default=None, help="Instance storing the input dataset, if not local"
+        "--instance",
+        type=str,
+        default=None,
+        help="Instance storing the input dataset, if not local",
     )
     parser.add_argument(
         "--version", type=str, default=None, help="Version of the input dataset."
@@ -35,125 +53,127 @@ def main():
         "--filter_gene_by_counts",
         type=Union[int, bool],
         default=False,
-        help="Determines whether to filter genes by counts."
+        help="Determines whether to filter genes by counts.",
     )
     parser.add_argument(
         "--filter_cell_by_counts",
         type=Union[int, bool],
         default=False,
-        help="Determines whether to filter cells by counts."
+        help="Determines whether to filter cells by counts.",
     )
     parser.add_argument(
         "--normalize_sum",
         type=float,
         default=1e4,
-        help="Determines whether to normalize the total counts of each cell to a specific value."
-    )
-    parser.add_argument(
-        "--keep_norm_layer",
-        type=bool,
-        default=False,
-        help="Determines whether to keep the normalization layer."
+        help="Determines whether to normalize the total counts of each cell to a specific value.",
     )
     parser.add_argument(
         "--subset_hvg",
         type=int,
         default=0,
-        help="Determines whether to subset highly variable genes."
+        help="Determines whether to subset highly variable genes.",
     )
     parser.add_argument(
         "--hvg_flavor",
         type=str,
         default="seurat_v3",
-        help="Specifies the flavor of highly variable genes selection."
+        help="Specifies the flavor of highly variable genes selection.",
     )
     parser.add_argument(
         "--binning",
         type=Optional[int],
         default=None,
-        help="Determines whether to bin the data into discrete values of number of bins provided."
+        help="Determines whether to bin the data into discrete values of number of bins provided.",
     )
     parser.add_argument(
         "--result_binned_key",
         type=str,
         default="X_binned",
-        help="Specifies the key of AnnData to store the binned data."
+        help="Specifies the key of AnnData to store the binned data.",
     )
     parser.add_argument(
         "--length_normalize",
         type=bool,
         default=False,
-        help="Determines whether to normalize the length."
+        help="Determines whether to normalize the length.",
     )
     parser.add_argument(
         "--force_preprocess",
         type=bool,
         default=False,
-        help="Determines whether to force preprocessing."
+        help="Determines whether to force preprocessing.",
     )
     parser.add_argument(
         "--min_dataset_size",
         type=int,
         default=100,
-        help="Specifies the minimum dataset size."
+        help="Specifies the minimum dataset size.",
     )
     parser.add_argument(
         "--min_valid_genes_id",
         type=int,
         default=10_000,
-        help="Specifies the minimum valid genes id."
+        help="Specifies the minimum valid genes id.",
     )
     parser.add_argument(
         "--min_nnz_genes",
         type=int,
-        default=200,
-        help="Specifies the minimum non-zero genes."
+        default=400,
+        help="Specifies the minimum non-zero genes.",
     )
     parser.add_argument(
         "--maxdropamount",
         type=int,
-        default=2,
-        help="Specifies the maximum drop amount."
+        default=50,
+        help="Specifies the maximum drop amount.",
     )
     parser.add_argument(
-        "--madoutlier",
-        type=int,
-        default=5,
-        help="Specifies the MAD outlier."
+        "--madoutlier", type=int, default=5, help="Specifies the MAD outlier."
     )
     parser.add_argument(
         "--pct_mt_outlier",
         type=int,
         default=8,
-        help="Specifies the percentage of MT outlier."
+        help="Specifies the percentage of MT outlier.",
     )
     parser.add_argument(
-        "--batch_key",
-        type=Optional[str],
-        default=None,
-        help="Specifies the batch key."
+        "--batch_key", type=Optional[str], default=None, help="Specifies the batch key."
     )
     parser.add_argument(
         "--skip_validate",
         type=bool,
         default=False,
-        help="Determines whether to skip validation."
+        help="Determines whether to skip validation.",
+    )
+    parser.add_argument(
+        "--do_postp",
+        type=bool,
+        default=False,
+        help="Determines whether to do postprocessing.",
     )
     args = parser.parse_args()
     # Load the collection
+    # if not args.preprocess:
+    #    print("Only preprocess is available for now")
+    #    return
     if args.instance is not None:
-        collection = ln.Collection.using(instance=args.instance).filter(name=args.name, version=args.version).first()
-    collection = ln.Collection.filter(name=args.name, version=args.version).first()
+        collection = (
+            ln.Collection.using(instance=args.instance)
+            .filter(name=args.name, version=args.version)
+            .first()
+        )
+    else:
+        collection = ln.Collection.filter(name=args.name, version=args.version).first()
-    print("using the dataset ",collection, " of size ",len(collection.artifacts.all()))
+    print(
+        "using the dataset ", collection, " of size ", len(collection.artifacts.all())
+    )
     # Initialize the preprocessor
     preprocessor = LaminPreprocessor(
         filter_gene_by_counts=args.filter_gene_by_counts,
         filter_cell_by_counts=args.filter_cell_by_counts,
         normalize_sum=args.normalize_sum,
-        keep_norm_layer=args.keep_norm_layer,
         subset_hvg=args.subset_hvg,
         hvg_flavor=args.hvg_flavor,
         binning=args.binning,
@@ -168,10 +188,14 @@ def main():
         pct_mt_outlier=args.pct_mt_outlier,
         batch_key=args.batch_key,
         skip_validate=args.skip_validate,
+        do_postp=args.do_postp,
+        additional_preprocess=additional_preprocess,
+        additional_postprocess=additional_postprocess,
+        keep_files=False,
     )
     # Preprocess the dataset
-    preprocessed_dataset = preprocessor(
+    preprocessor(
         collection,
         name=args.new_name,
         description=args.description,

scdataloader/collator.py CHANGED

@@ -1,25 +1,27 @@
 import numpy as np
-from .utils import load_genes
-from torch import Tensor
-# class SimpleCollator:
+from .utils import load_genes, downsample_profile
+from torch import Tensor, long
+from typing import Optional
 class Collator:
     def __init__(
         self,
-        organisms: list,
-        org_to_id: dict = None,
-        valid_genes: list = [],
-        max_len=2000,
-        n_bins=0,
-        add_zero_genes=200,
-        logp1=False,
-        norm_to=None,
-        how="all",
-        tp_name=None,
-        organism_name="organism_ontology_term_id",
-        class_names=[],
+        organisms: list[str],
+        how: str = "all",
+        org_to_id: dict[str, int] = None,
+        valid_genes: list[str] = [],
+        max_len: int = 2000,
+        add_zero_genes: int = 0,
+        logp1: bool = False,
+        norm_to: Optional[float] = None,
+        n_bins: int = 0,
+        tp_name: Optional[str] = None,
+        organism_name: str = "organism_ontology_term_id",
+        class_names: list[str] = [],
+        genelist: list[str] = [],
+        downsample: Optional[float] = None,  # don't use it for training!
+        save_output: bool = False,
     ):
         """
         This class is responsible for collating data for the scPRINT model. It handles the
@@ -27,51 +29,81 @@ class Collator:
         allowing for various configurations such as maximum gene list length, normalization,
         and selection method for gene expression.
+        This Collator should work with scVI's dataloader as well!
         Args:
             organisms (list): List of organisms to be considered for gene expression data.
+                it will drop any other organism it sees (might lead to batches of different sizes!)
+            how (flag, optional): Method for selecting gene expression. Defaults to "most expr".
+                one of ["most expr", "random expr", "all", "some"]:
+                "most expr": selects the max_len most expressed genes,
+                if less genes are expressed, will sample random unexpressed genes,
+                "random expr": uses a random set of max_len expressed genes.
+                if less genes are expressed, will sample random unexpressed genes
+                "all": uses all genes
+                "some": uses only the genes provided through the genelist param
             org_to_id (dict): Dictionary mapping organisms to their respective IDs.
-            labels (list, optional): List of labels for the data. Defaults to [].
             valid_genes (list, optional): List of genes from the datasets, to be considered. Defaults to [].
-            max_len (int, optional): Maximum length of the gene list. Defaults to 2000.
-            n_bins (int, optional): Number of bins for binning the data. Defaults to 0.
-            add_zero_genes (int, optional): Number of zero genes to add. Defaults to 200.
+                it will drop any other genes from the input expression data (usefull when your model only works on some genes)
+            max_len (int, optional): Total number of genes to use (for random expr and most expr). Defaults to 2000.
+            n_bins (int, optional): Number of bins for binning the data. Defaults to 0. meaning, no binning of expression.
+            add_zero_genes (int, optional): Number of additional unexpressed genes to add to the input data. Defaults to 0.
             logp1 (bool, optional): If True, logp1 normalization is applied. Defaults to False.
-            norm_to (str, optional): Normalization method to be applied. Defaults to None.
-            how (str, optional): Method for selecting gene expression. Defaults to "most expr".
+            norm_to (float, optional): Rescaling value of the normalization to be applied. Defaults to None.
+            organism_name (str, optional): Name of the organism ontology term id. Defaults to "organism_ontology_term_id".
+            tp_name (str, optional): Name of the heat diff. Defaults to None.
+            class_names (list, optional): List of other classes to be considered. Defaults to [].
+            genelist (list, optional): List of genes to be considered. Defaults to [].
+                If [] all genes will be considered
+            downsample (float, optional): Downsample the profile to a certain number of cells. Defaults to None.
+                This is usually done by the scPRINT model during training but this option allows you to do it directly from the collator
+            save_output (bool, optional): If True, saves the output to a file. Defaults to False.
+                This is mainly for debugging purposes
         """
         self.organisms = organisms
-        self.valid_genes = valid_genes
+        self.genedf = load_genes(organisms)
         self.max_len = max_len
         self.n_bins = n_bins
         self.add_zero_genes = add_zero_genes
         self.logp1 = logp1
         self.norm_to = norm_to
-        self.org_to_id = org_to_id
         self.how = how
-        self.organism_ids = (
-            set([org_to_id[k] for k in organisms])
-            if org_to_id is not None
-            else set(organisms)
-        )
+        if self.how == "some":
+            assert len(genelist) > 0, "if how is some, genelist must be provided"
         self.organism_name = organism_name
         self.tp_name = tp_name
         self.class_names = class_names
+        self.save_output = save_output
         self.start_idx = {}
         self.accepted_genes = {}
-        self.genedf = load_genes(organisms)
-        for organism in set(self.genedf.organism):
+        self.downsample = downsample
+        self.to_subset = {}
+        self._setup(org_to_id, valid_genes, genelist)
+    def _setup(self, org_to_id=None, valid_genes=[], genelist=[]):
+        self.org_to_id = org_to_id
+        self.to_subset = {}
+        self.accepted_genes = {}
+        self.start_idx = {}
+        self.organism_ids = (
+            set([org_to_id[k] for k in self.organisms])
+            if org_to_id is not None
+            else set(self.organisms)
+        )
+        for organism in self.organisms:
             ogenedf = self.genedf[self.genedf.organism == organism]
+            tot = self.genedf[self.genedf.index.isin(valid_genes)]
             org = org_to_id[organism] if org_to_id is not None else organism
-            self.start_idx.update(
-                {org: np.where(self.genedf.organism == organism)[0][0]}
-            )
+            self.start_idx.update({org: np.where(tot.organism == organism)[0][0]})
             if len(valid_genes) > 0:
                 self.accepted_genes.update({org: ogenedf.index.isin(valid_genes)})
+            if len(genelist) > 0:
+                df = ogenedf[ogenedf.index.isin(valid_genes)]
+                self.to_subset.update({org: df.index.isin(genelist)})
-    def __call__(self, batch):
+    def __call__(self, batch) -> dict[str, Tensor]:
         """
-        __call__ is a special method in Python that is called when an instance of the class is called.
+        __call__ applies the collator to a minibatch of data
         Args:
             batch (list[dict[str: array]]): List of dicts of arrays containing gene expression data.
@@ -92,33 +124,62 @@ class Collator:
         other_classes = []
         gene_locs = []
         tp = []
+        dataset = []
+        nnz_loc = []
         for elem in batch:
             organism_id = elem[self.organism_name]
             if organism_id not in self.organism_ids:
                 continue
+            if "_storage_idx" in elem:
+                dataset.append(elem["_storage_idx"])
             expr = np.array(elem["x"])
             total_count.append(expr.sum())
             if len(self.accepted_genes) > 0:
                 expr = expr[self.accepted_genes[organism_id]]
             if self.how == "most expr":
-                loc = np.argsort(expr)[-(self.max_len) :][::-1]
+                nnz_loc = np.where(expr > 0)[0]
+                ma = self.max_len if self.max_len < len(nnz_loc) else len(nnz_loc)
+                loc = np.argsort(expr)[-(ma):][::-1]
+                # nnz_loc = [1] * 30_000
+                # loc = np.argsort(expr)[-(self.max_len) :][::-1]
             elif self.how == "random expr":
                 nnz_loc = np.where(expr > 0)[0]
                 loc = nnz_loc[
-                    np.random.choice(len(nnz_loc), self.max_len, replace=False)
+                    np.random.choice(
+                        len(nnz_loc),
+                        self.max_len if self.max_len < len(nnz_loc) else len(nnz_loc),
+                        replace=False,
+                        # p=(expr.max() + (expr[nnz_loc])*19) / expr.max(), # 20 at most times more likely to be selected
+                    )
                 ]
-            elif self.how == "all":
+            elif self.how in ["all", "some"]:
                 loc = np.arange(len(expr))
             else:
                 raise ValueError("how must be either most expr or random expr")
-            if self.add_zero_genes > 0 and self.how != "all":
+            if (
+                (self.add_zero_genes > 0) or (self.max_len > len(nnz_loc))
+            ) and self.how not in ["all", "some"]:
                 zero_loc = np.where(expr == 0)[0]
-                zero_loc = [
-                    np.random.choice(len(zero_loc), self.add_zero_genes, replace=False)
+                zero_loc = zero_loc[
+                    np.random.choice(
+                        len(zero_loc),
+                        self.add_zero_genes
+                        + (
+                            0
+                            if self.max_len < len(nnz_loc)
+                            else self.max_len - len(nnz_loc)
+                        ),
+                        replace=False,
+                    )
                 ]
                 loc = np.concatenate((loc, zero_loc), axis=None)
-            exprs.append(expr[loc])
-            gene_locs.append(loc + self.start_idx[organism_id])
+            expr = expr[loc]
+            loc = loc + self.start_idx[organism_id]
+            if self.how == "some":
+                expr = expr[self.to_subset[organism_id]]
+                loc = loc[self.to_subset[organism_id]]
+            exprs.append(expr)
+            gene_locs.append(loc)
             if self.tp_name is not None:
                 tp.append(elem[self.tp_name])
@@ -132,6 +193,7 @@ class Collator:
         gene_locs = np.array(gene_locs)
         total_count = np.array(total_count)
         other_classes = np.array(other_classes)
+        dataset = np.array(dataset)
         # normalize counts
         if self.norm_to is not None:
@@ -152,20 +214,34 @@ class Collator:
         # do encoding of graph location
         # encode all the edges in some sparse way
         # normalizing total counts between 0,1
-        return {
+        ret = {
             "x": Tensor(expr),
             "genes": Tensor(gene_locs).int(),
             "class": Tensor(other_classes).int(),
             "tp": Tensor(tp),
             "depth": Tensor(total_count),
         }
+        if len(dataset) > 0:
+            ret.update({"dataset": Tensor(dataset).to(long)})
+        if self.downsample is not None:
+            ret["x"] = downsample_profile(ret["x"], self.downsample)
+        if self.save_output:
+            with open("collator_output.txt", "a") as f:
+                np.savetxt(f, ret["x"].numpy())
+        return ret
 class AnnDataCollator(Collator):
     def __init__(self, *args, **kwargs):
+        """
+        AnnDataCollator Collator to use if working with AnnData's experimental dataloader (it is very slow!!!)
+        Args:
+            @see Collator
+        """
         super().__init__(*args, **kwargs)
-    def __call__(self, batch):
+    def __call__(self, batch) -> dict[str, Tensor]:
         exprs = []
         total_count = []
         other_classes = []
@@ -218,28 +294,17 @@ class AnnDataCollator(Collator):
         }
-class SCVICollator(Collator):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-    def __call__(self, batch):
-        expr = batch["x"]
-        total_count = expr.sum(axis=1)
-        if self.how == "most expr":
-            loc = np.argsort(expr)[:, -(self.max_len) :][:, ::-1]
-        else:
-            raise ValueError("how must be either most expr or random expr")
-        if self.logp1:
-            expr = np.log2(1 + expr)
-        return {
-            "x": Tensor(expr[np.arange(expr.shape[0])[:, None], loc]),
-            "genes": Tensor(loc.copy()).int(),
-            "depth": Tensor(total_count),
-        }
+#############
+#### WIP ####
+#############
 class GeneformerCollator(Collator):
     def __init__(self, *args, gene_norm_list: list, **kwargs):
+        """
+        GeneformerCollator to finish
+        Args:
+            gene_norm_list (list): the normalization of expression through all datasets, per gene.
+        """
         super().__init__(*args, **kwargs)
         self.gene_norm_list = gene_norm_list
@@ -251,6 +316,10 @@ class GeneformerCollator(Collator):
 class scGPTCollator(Collator):
+    """
+    scGPTCollator to finish
+    """
     def __call__(self, batch):
         super().__call__(batch)
         # binning

scdataloader/config.py ADDED

@@ -0,0 +1,112 @@
+"""
+Configuration file for scDataLoader
+Missing labels are added to the dataset to complete a better hierarchical tree
+"""
+LABELS_TOADD = {
+    "assay_ontology_term_id": {
+        "10x transcription profiling": "EFO:0030003",
+        "spatial transcriptomics": "EFO:0008994",
+        "10x 3' transcription profiling": "EFO:0030003",
+        "10x 5' transcription profiling": "EFO:0030004",
+    },
+    "disease_ontology_term_id": {
+        "metabolic disease": "MONDO:0005066",
+        "chronic kidney disease": "MONDO:0005300",
+        "chromosomal disorder": "MONDO:0019040",
+        "infectious disease": "MONDO:0005550",
+        "inflammatory disease": "MONDO:0021166",
+        # "immune system disease",
+        "disorder of development or morphogenesis": "MONDO:0021147",
+        "mitochondrial disease": "MONDO:0044970",
+        "psychiatric disorder": "MONDO:0002025",
+        "cancer or benign tumor": "MONDO:0002025",
+        "neoplasm": "MONDO:0005070",
+    },
+    "cell_type_ontology_term_id": {
+        "progenitor cell": "CL:0011026",
+        "hematopoietic cell": "CL:0000988",
+        "myoblast": "CL:0000056",
+        "myeloid cell": "CL:0000763",
+        "neuron": "CL:0000540",
+        "electrically active cell": "CL:0000211",
+        "epithelial cell": "CL:0000066",
+        "secretory cell": "CL:0000151",
+        "stem cell": "CL:0000034",
+        "non-terminally differentiated cell": "CL:0000055",
+        "supporting cell": "CL:0000630",
+    },
+}
+COARSE_TISSUE = {
+    "adipose tissue": "",
+    "bladder organ": "",
+    "blood": "",
+    "bone marrow": "",
+    "brain": "",
+    "breast": "",
+    "esophagus": "",
+    "eye": "",
+    "embryo": "",
+    "fallopian tube": "",
+    "gall bladder": "",
+    "heart": "",
+    "intestine": "",
+    "kidney": "",
+    "liver": "",
+    "lung": "",
+    "lymph node": "",
+    "musculature of body": "",
+    "nose": "",
+    "ovary": "",
+    "pancreas": "",
+    "placenta": "",
+    "skin of body": "",
+    "spinal cord": "",
+    "spleen": "",
+    "stomach": "",
+    "thymus": "",
+    "thyroid gland": "",
+    "tongue": "",
+    "uterus": "",
+}
+COARSE_ANCESTRY = {
+    "African": "",
+    "Chinese": "",
+    "East Asian": "",
+    "Eskimo": "",
+    "European": "",
+    "Greater Middle Eastern  (Middle Eastern, North African or Persian)": "",
+    "Hispanic or Latin American": "",
+    "Native American": "",
+    "Oceanian": "",
+    "South Asian": "",
+}
+COARSE_DEVELOPMENT_STAGE = {
+    "Embryonic human": "",
+    "Fetal": "",
+    "Immature": "",
+    "Mature": "",
+}
+COARSE_ASSAY = {
+    "10x 3'": "",
+    "10x 5'": "",
+    "10x multiome": "",
+    "CEL-seq2": "",
+    "Drop-seq": "",
+    "GEXSCOPE technology": "",
+    "inDrop": "",
+    "microwell-seq": "",
+    "sci-Plex": "",
+    "sci-RNA-seq": "",
+    "Seq-Well": "",
+    "Slide-seq": "",
+    "Smart-seq": "",
+    "SPLiT-seq": "",
+    "TruDrop": "",
+    "Visium Spatial Gene Expression": "",
+}

scdataloader 0.0.3__py3-none-any.whl → 1.0.1__py3-none-any.whl

scdataloader 0.0.3__py3-none-any.whl → 1.0.1__py3-none-any.whl