PyPI - scdataloader - Versions diffs - 1.6.4__py3-none-any.whl → 1.8.0__py3-none-any.whl - Mend

scdataloader-1.6.4-py3-none-any.whl → scdataloader-1.8.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

scdataloader/VERSION +1 -1
scdataloader/__init__.py +2 -0
scdataloader/__main__.py +98 -36
scdataloader/collator.py +13 -7
scdataloader/config.py +99 -0
scdataloader/data.py +48 -35
scdataloader/datamodule.py +138 -44
scdataloader/mapped.py +656 -0
scdataloader/preprocess.py +239 -91
scdataloader/utils.py +71 -27
{scdataloader-1.6.4.dist-info → scdataloader-1.8.0.dist-info}/METADATA +10 -8
scdataloader-1.8.0.dist-info/RECORD +16 -0
{scdataloader-1.6.4.dist-info → scdataloader-1.8.0.dist-info}/WHEEL +1 -1
scdataloader-1.8.0.dist-info/entry_points.txt +2 -0
scdataloader-1.6.4.dist-info/RECORD +0 -14
{scdataloader-1.6.4.dist-info → scdataloader-1.8.0.dist-info}/licenses/LICENSE +0 -0

scdataloader/VERSION CHANGED Viewed

@@ -1 +1 @@
-1.6.4
+1.8.0

scdataloader/__init__.py CHANGED Viewed

@@ -2,3 +2,5 @@ from .collator import Collator
 from .data import Dataset, SimpleAnnDataset
 from .datamodule import DataModule
 from .preprocess import Preprocessor
+__version__ = "1.7.0"

scdataloader/__main__.py CHANGED Viewed

@@ -10,157 +10,218 @@ from scdataloader.preprocess import (
 )
-# scdataloader --instance="laminlabs/cellxgene" --name="cellxgene-census" --version="2023-12-15" --description="preprocessed for scprint" --new_name="scprint main" --start_at=39
+# scdataloader --instance="laminlabs/cellxgene" --name="cellxgene-census" --version="2023-12-15" \
+# --description="scPRINT-V2 datasets" --new_name="scprint v2" --n_hvg_for_postp=4000 --cache=False \
+# --filter_gene_by_counts=0 --filter_cell_by_counts=300 --min_valid_genes_id=500 \
+# --min_nnz_genes=120 --min_dataset_size=100 --maxdropamount=90 \
+# --organisms=["NCBITaxon:9606","NCBITaxon:9544","NCBITaxon:9483","NCBITaxon:10090"] \
+# --start_at=0
 def main():
     """
-    main function to preprocess datasets in a given lamindb collection.
+    Main function to either preprocess datasets in a lamindb collection or populate ontologies.
     """
     parser = argparse.ArgumentParser(
-        description="Preprocess datasets in a given lamindb collection."
+        description="Preprocess datasets or populate ontologies."
     )
-    parser.add_argument(
+    subparsers = parser.add_subparsers(dest="command", help="Available commands")
+    # Preprocess command
+    preprocess_parser = subparsers.add_parser("preprocess", help="Preprocess datasets")
+    preprocess_parser.add_argument(
         "--name", type=str, required=True, help="Name of the input dataset"
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--new_name",
         type=str,
         default="preprocessed dataset",
         help="Name of the preprocessed dataset.",
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--description",
         type=str,
         default="preprocessed by scDataLoader",
         help="Description of the preprocessed dataset.",
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--start_at", type=int, default=0, help="Position to start preprocessing at."
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--new_version",
         type=str,
         default="2",
         help="Version of the output dataset and files.",
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--instance",
         type=str,
         default=None,
         help="Instance storing the input dataset, if not local",
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--version", type=str, default=None, help="Version of the input dataset."
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--filter_gene_by_counts",
         type=int,
         default=0,
         help="Determines whether to filter genes by counts.",
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--filter_cell_by_counts",
         type=int,
         default=0,
         help="Determines whether to filter cells by counts.",
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--normalize_sum",
         type=float,
         default=1e4,
         help="Determines whether to normalize the total counts of each cell to a specific value.",
     )
-    parser.add_argument(
-        "--subset_hvg",
+    preprocess_parser.add_argument(
+        "--n_hvg_for_postp",
         type=int,
         default=0,
         help="Determines whether to subset highly variable genes.",
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--hvg_flavor",
         type=str,
         default="seurat_v3",
         help="Specifies the flavor of highly variable genes selection.",
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--binning",
         type=Optional[int],
         default=None,
         help="Determines whether to bin the data into discrete values of number of bins provided.",
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--result_binned_key",
         type=str,
         default="X_binned",
         help="Specifies the key of AnnData to store the binned data.",
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--length_normalize",
         type=bool,
         default=False,
         help="Determines whether to normalize the length.",
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--force_preprocess",
         type=bool,
         default=False,
         help="Determines whether to force preprocessing.",
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--min_dataset_size",
         type=int,
         default=100,
         help="Specifies the minimum dataset size.",
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--min_valid_genes_id",
         type=int,
         default=10_000,
         help="Specifies the minimum valid genes id.",
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--min_nnz_genes",
         type=int,
-        default=400,
+        default=200,
         help="Specifies the minimum non-zero genes.",
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--maxdropamount",
         type=int,
         default=50,
         help="Specifies the maximum drop amount.",
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--madoutlier", type=int, default=5, help="Specifies the MAD outlier."
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--pct_mt_outlier",
         type=int,
         default=8,
         help="Specifies the percentage of MT outlier.",
     )
-    parser.add_argument(
-        "--batch_key", type=Optional[str], default=None, help="Specifies the batch key."
+    preprocess_parser.add_argument(
+        "--batch_keys",
+        type=list[str],
+        default=[
+            "assay_ontology_term_id",
+            "self_reported_ethnicity_ontology_term_id",
+            "sex_ontology_term_id",
+            "donor_id",
+            "suspension_type",
+        ],
+        help="Specifies the batch keys.",
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--skip_validate",
         type=bool,
         default=False,
         help="Determines whether to skip validation.",
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--do_postp",
         type=bool,
-        default=False,
+        default=True,
         help="Determines whether to do postprocessing.",
     )
-    parser.add_argument(
+    preprocess_parser.add_argument(
         "--cache",
         type=bool,
-        default=True,
+        default=False,
         help="Determines whether to cache the dataset.",
     )
+    preprocess_parser.add_argument(
+        "--organisms",
+        type=list,
+        default=[
+            "NCBITaxon:9606",
+            "NCBITaxon:10090",
+        ],
+        help="Determines the organisms to keep.",
+    )
+    preprocess_parser.add_argument(
+        "--force_preloaded",
+        type=bool,
+        default=False,
+        help="Determines whether the dataset is preloaded.",
+    )
+    # Populate command
+    populate_parser = subparsers.add_parser("populate", help="Populate ontologies")
+    populate_parser.add_argument(
+        "what",
+        nargs="?",
+        default="all",
+        choices=[
+            "all",
+            "organisms",
+            "celltypes",
+            "diseases",
+            "tissues",
+            "assays",
+            "ethnicities",
+            "sex",
+            "dev_stages",
+        ],
+        help="What ontologies to populate",
+    )
     args = parser.parse_args()
+    if args.command == "populate":
+        from scdataloader.utils import populate_my_ontology
+        if args.what != "all":
+            raise ValueError("Only 'all' is supported for now")
+        else:
+            populate_my_ontology()
+        return
     # Load the collection
     # if not args.preprocess:
     #    print("Only preprocess is available for now")
@@ -182,7 +243,7 @@ def main():
         filter_gene_by_counts=args.filter_gene_by_counts,
         filter_cell_by_counts=args.filter_cell_by_counts,
         normalize_sum=args.normalize_sum,
-        subset_hvg=args.subset_hvg,
+        n_hvg_for_postp=args.n_hvg_for_postp,
         hvg_flavor=args.hvg_flavor,
         cache=args.cache,
         binning=args.binning,
@@ -195,12 +256,13 @@ def main():
         maxdropamount=args.maxdropamount,
         madoutlier=args.madoutlier,
         pct_mt_outlier=args.pct_mt_outlier,
-        batch_key=args.batch_key,
+        batch_keys=args.batch_keys,
         skip_validate=args.skip_validate,
         do_postp=args.do_postp,
         additional_preprocess=additional_preprocess,
         additional_postprocess=additional_postprocess,
         keep_files=False,
+        force_preloaded=args.force_preloaded,
     )
     # Preprocess the dataset

scdataloader/collator.py CHANGED Viewed

@@ -23,7 +23,8 @@ class Collator:
         class_names: list[str] = [],
         genelist: list[str] = [],
         downsample: Optional[float] = None,  # don't use it for training!
-        save_output: bool = False,
+        save_output: Optional[str] = None,
+        metacell_mode: bool = False,
     ):
         """
         This class is responsible for collating data for the scPRINT model. It handles the
@@ -59,8 +60,9 @@ class Collator:
                 If [] all genes will be considered
             downsample (float, optional): Downsample the profile to a certain number of cells. Defaults to None.
                 This is usually done by the scPRINT model during training but this option allows you to do it directly from the collator
-            save_output (bool, optional): If True, saves the output to a file. Defaults to False.
+            save_output (str, optional): If not None, saves the output to a file. Defaults to None.
                 This is mainly for debugging purposes
+            metacell_mode (bool, optional): Whether to sample a metacell. Defaults to False.
         """
         self.organisms = organisms
         self.genedf = load_genes(organisms)
@@ -80,6 +82,7 @@ class Collator:
         self.accepted_genes = {}
         self.downsample = downsample
         self.to_subset = {}
+        self.metacell_mode = metacell_mode
         self._setup(org_to_id, valid_genes, genelist)
     def _setup(self, org_to_id=None, valid_genes=[], genelist=[]):
@@ -131,6 +134,7 @@ class Collator:
         tp = []
         dataset = []
         nnz_loc = []
+        is_meta = []
         for elem in batch:
             organism_id = elem[self.organism_name]
             if organism_id not in self.organism_ids:
@@ -193,16 +197,16 @@ class Collator:
                 tp.append(elem[self.tp_name])
             else:
                 tp.append(0)
+            if self.metacell_mode:
+                is_meta.append(elem["is_meta"])
             other_classes.append([elem[i] for i in self.class_names])
         expr = np.array(exprs)
         tp = np.array(tp)
         gene_locs = np.array(gene_locs)
         total_count = np.array(total_count)
         other_classes = np.array(other_classes)
         dataset = np.array(dataset)
+        is_meta = np.array(is_meta)
         # normalize counts
         if self.norm_to is not None:
             expr = (expr * self.norm_to) / total_count[:, None]
@@ -229,12 +233,14 @@ class Collator:
             "tp": Tensor(tp),
             "depth": Tensor(total_count),
         }
+        if self.metacell_mode:
+            ret.update({"is_meta": Tensor(is_meta).int()})
         if len(dataset) > 0:
             ret.update({"dataset": Tensor(dataset).to(long)})
         if self.downsample is not None:
             ret["x"] = downsample_profile(ret["x"], self.downsample)
-        if self.save_output:
-            with open("collator_output.txt", "a") as f:
+        if self.save_output is not None:
+            with open(self.save_output, "a") as f:
                 np.savetxt(f, ret["x"].numpy())
         return ret

scdataloader/config.py CHANGED Viewed

@@ -110,3 +110,102 @@ COARSE_ASSAY = {
     "TruDrop": "",
     "Visium Spatial Gene Expression": "",
 }
+MAIN_HUMAN_MOUSE_DEV_STAGE_MAP = {
+    "HsapDv:0010000": [
+        "MmusDv:0000092",  # postnatal stage
+    ],
+    "HsapDv:0000258": [  # mature stage
+        "MmusDv:0000110",  # mature stage
+        "HsapDv:0000204",
+    ],
+    "HsapDv:0000227": [  # late adult stage
+        "MmusDv:0000091",  # 20 month-old stage
+        "MmusDv:0000089",  # 18 month-old stage
+    ],
+    "HsapDv:0000272": [],  # 60-79 year-old stage
+    "HsapDv:0000095": [],  # 80 year-old and over stage
+    "HsapDv:0000267": [  # middle aged stage
+        "MmusDv:0000087",  # 16 month-old stage
+        "UBERON:0018241",  # prime adult stage
+        "MmusDv:0000083",  # 12 month-old stage
+        "HsapDv:0000092",  # same
+    ],
+    "HsapDv:0000266": [  # young adult stage
+        "MmusDv:0000050",  # 6 weeks
+        "HsapDv:0000089",  # same
+        "MmusDv:0000051",  # 7 weeks
+        "MmusDv:0000052",  # 8 weeks
+        "MmusDv:0000053",  # 9 weeks
+        "MmusDv:0000054",  # 10 weeks
+        "MmusDv:0000055",  # 11 weeks
+        "MmusDv:0000056",  # 12 weeks
+        "MmusDv:0000057",  # 13 weeks
+        "MmusDv:0000058",  # 14 weeks
+        "MmusDv:0000059",  # 15 weeks
+        "MmusDv:0000061",  # early adult stage
+        "MmusDv:0000062",  # 2 month-old stage
+        "MmusDv:0000063",  # 3 month-old stage
+        "MmusDv:0000064",  # 4 month-old stage
+        "MmusDv:0000065",  # 16 weeks
+        "MmusDv:0000066",  # 17 weeks
+        "MmusDv:0000067",  # 18 weeks
+        "MmusDv:0000068",  # 19 weeks
+        "MmusDv:0000070",  # 20 weeks
+        "MmusDv:0000071",  # 21 weeks
+        "MmusDv:0000072",  # 22 weeks
+        "MmusDv:0000073",  # 23 weeks
+        "MmusDv:0000074",  # 24 weeks
+        "MmusDv:0000077",  # 6 month-old stage
+        "MmusDv:0000079",  # 8 month-old stage
+        "MmusDv:0000098",  # 25 weeks
+        "MmusDv:0000099",  # 26 weeks
+        "MmusDv:0000102",  # 29 weeks
+    ],
+    "HsapDv:0000265": [],  # child stage (1-4 yo)
+    "HsapDv:0000271": [  # juvenile stage (5-14 yo)
+        "MmusDv:0000048",  # 4 weeks
+        "MmusDv:0000049",  # 5 weeks
+    ],
+    "HsapDv:0000260": [  # infant stage
+        "MmusDv:0000046",  # 2 weeks
+        "MmusDv:0000045",  # 1 week
+        "MmusDv:0000047",  # 3 weeks
+        "HsapDv:0000083",
+    ],
+    "HsapDv:0000262": [  # newborn stage (0-28 days)
+        "MmusDv:0000036",  # Theiler stage 27
+        "MmusDv:0000037",  # Theiler stage 28
+        "MmusDv:0000113",  # 4-7 days
+    ],
+    "HsapDv:0000007": [],  # Carnegie stage 03
+    "HsapDv:0000008": [],  # Carnegie stage 04
+    "HsapDv:0000009": [],  # Carnegie stage 05
+    "HsapDv:0000003": [],  # Carnegie stage 01
+    "HsapDv:0000005": [],  # Carnegie stage 02
+    "HsapDv:0000010": [],  # gastrula stage
+    "HsapDv:0000012": [],  # neurula stage
+    "HsapDv:0000015": [  # organogenesis stage
+        "MmusDv:0000019",  # Theiler stage 13
+        "MmusDv:0000020",  # Theiler stage 12
+        "MmusDv:0000021",  # Theiler stage 14
+        "MmusDv:0000022",  # Theiler stage 15
+        "MmusDv:0000023",  # Theiler stage 16
+        "MmusDv:0000024",  # Theiler stage 17
+        "MmusDv:0000025",  # Theiler stage 18
+        "MmusDv:0000026",  # Theiler stage 19
+        "MmusDv:0000027",  # Theiler stage 20
+        "MmusDv:0000028",  # Theiler stage 21
+        "MmusDv:0000029",  # Theiler stage 22
+    ],
+    "HsapDv:0000037": [  # fetal stage
+        "MmusDv:0000033",  # Theiler stage 24
+        "MmusDv:0000034",  # Theiler stage 25
+        "MmusDv:0000035",  # Theiler stage 26
+        "MmusDv:0000032",  # Theiler stage 23
+    ],
+    "unknown": [
+        "MmusDv:0000041",  # unknown
+    ],
+}

scdataloader/data.py CHANGED Viewed

@@ -10,8 +10,6 @@ import lamindb as ln
 import numpy as np
 import pandas as pd
 from anndata import AnnData
-from lamindb.core import MappedCollection
-from lamindb.core._mapped_collection import _Connect
 from lamindb.core.storage._anndata_accessor import _safer_read_index
 from scipy.sparse import issparse
 from torch.utils.data import Dataset as torchDataset
@@ -19,6 +17,7 @@ from torch.utils.data import Dataset as torchDataset
 from scdataloader.utils import get_ancestry_mapping, load_genes
 from .config import LABELS_TOADD
+from .mapped import MappedCollection, _Connect
 @dataclass
@@ -43,7 +42,7 @@ class Dataset(torchDataset):
         organisms (list[str]): list of organisms to load
             (for now only validates the the genes map to this organism)
         obs (list[str]): list of observations to load from the Collection
-        clss_to_pred (list[str]): list of observations to encode
+        clss_to_predict (list[str]): list of observations to encode
         join_vars (flag): join variables @see :meth:`~lamindb.Dataset.mapped`.
         hierarchical_clss: list of observations to map to a hierarchy using lamin's bionty
     """
@@ -53,37 +52,23 @@ class Dataset(torchDataset):
     organisms: Optional[Union[list[str], str]] = field(
         default_factory=["NCBITaxon:9606", "NCBITaxon:10090"]
     )
-    obs: Optional[list[str]] = field(
-        default_factory=[
-            "self_reported_ethnicity_ontology_term_id",
-            "assay_ontology_term_id",
-            "development_stage_ontology_term_id",
-            "disease_ontology_term_id",
-            "cell_type_ontology_term_id",
-            "tissue_ontology_term_id",
-            "sex_ontology_term_id",
-            #'dataset_id',
-            #'cell_culture',
-            # "dpt_group",
-            # "heat_diff",
-            # "nnz",
-        ]
-    )
     # set of obs to prepare for prediction (encode)
-    clss_to_pred: Optional[list[str]] = field(default_factory=list)
+    clss_to_predict: Optional[list[str]] = field(default_factory=list)
     # set of obs that need to be hierarchically prepared
     hierarchical_clss: Optional[list[str]] = field(default_factory=list)
     join_vars: Literal["inner", "outer"] | None = None
+    metacell_mode: float = 0.0
     def __post_init__(self):
         self.mapped_dataset = mapped(
             self.lamin_dataset,
-            obs_keys=self.obs,
+            obs_keys=list(set(self.hierarchical_clss + self.clss_to_predict)),
             join=self.join_vars,
-            encode_labels=self.clss_to_pred,
+            encode_labels=self.clss_to_predict,
             unknown_label="unknown",
             stream=True,
             parallel=True,
+            metacell_mode=self.metacell_mode,
         )
         print(
             "won't do any check but we recommend to have your dataset coming from local storage"
@@ -93,8 +78,8 @@ class Dataset(torchDataset):
         # generate tree from ontologies
         if len(self.hierarchical_clss) > 0:
             self.define_hierarchies(self.hierarchical_clss)
-        if len(self.clss_to_pred) > 0:
-            for clss in self.clss_to_pred:
+        if len(self.clss_to_predict) > 0:
+            for clss in self.clss_to_predict:
                 if clss not in self.hierarchical_clss:
                     # otherwise it's already been done
                     self.class_topred[clss] = set(
@@ -143,8 +128,7 @@ class Dataset(torchDataset):
             + "dataset contains:\n"
             + "     {} cells\n".format(self.mapped_dataset.__len__())
             + "     {} genes\n".format(self.genedf.shape[0])
-            + "     {} labels\n".format(len(self.obs))
-            + "     {} clss_to_pred\n".format(len(self.clss_to_pred))
+            + "     {} clss_to_predict\n".format(len(self.clss_to_predict))
             + "     {} hierarchical_clss\n".format(len(self.hierarchical_clss))
             + "     {} organisms\n".format(len(self.organisms))
             + (
@@ -154,9 +138,16 @@ class Dataset(torchDataset):
                 if len(self.class_topred) > 0
                 else ""
             )
+            + "     {} metacell_mode\n".format(self.metacell_mode)
         )
-    def get_label_weights(self, obs_keys: str | list[str], scaler: int = 10):
+    def get_label_weights(
+        self,
+        obs_keys: str | list[str],
+        scaler: int = 10,
+        return_categories=False,
+        bypass_label=["neuron"],
+    ):
         """Get all weights for the given label keys."""
         if isinstance(obs_keys, str):
             obs_keys = [obs_keys]
@@ -167,16 +158,24 @@ class Dataset(torchDataset):
             )
             labels_list.append(labels_to_str)
         if len(labels_list) > 1:
-            labels = reduce(lambda a, b: a + b, labels_list)
+            labels = ["___".join(labels_obs) for labels_obs in zip(*labels_list)]
         else:
             labels = labels_list[0]
         counter = Counter(labels)  # type: ignore
-        rn = {n: i for i, n in enumerate(counter.keys())}
-        labels = np.array([rn[label] for label in labels])
-        counter = np.array(list(counter.values()))
-        weights = scaler / (counter + scaler)
-        return weights, labels
+        if return_categories:
+            rn = {n: i for i, n in enumerate(counter.keys())}
+            labels = np.array([rn[label] for label in labels])
+            counter = np.array(list(counter.values()))
+            weights = scaler / (counter + scaler)
+            return weights, labels
+        else:
+            counts = np.array([counter[label] for label in labels])
+            if scaler is None:
+                weights = 1.0 / counts
+            else:
+                weights = scaler / (counts + scaler)
+            return weights
     def get_unseen_mapped_dataset_elements(self, idx: int):
         """
@@ -209,6 +208,8 @@ class Dataset(torchDataset):
                 "tissue_ontology_term_id",
                 "disease_ontology_term_id",
                 "development_stage_ontology_term_id",
+                "simplified_dev_stage",
+                "age_group",
                 "assay_ontology_term_id",
                 "self_reported_ethnicity_ontology_term_id",
             ]:
@@ -235,7 +236,11 @@ class Dataset(torchDataset):
                     .df(include=["parents__ontology_id"])
                     .set_index("ontology_id")
                 )
-            elif clss == "development_stage_ontology_term_id":
+            elif clss in [
+                "development_stage_ontology_term_id",
+                "simplified_dev_stage",
+                "age_group",
+            ]:
                 parentdf = (
                     bt.DevelopmentalStage.filter()
                     .df(include=["parents__ontology_id"])
@@ -268,7 +273,7 @@ class Dataset(torchDataset):
                 if len(j) == 0:
                     groupings.pop(i)
             self.labels_groupings[clss] = groupings
-            if clss in self.clss_to_pred:
+            if clss in self.clss_to_predict:
                 # if we have added new clss, we need to update the encoder with them too.
                 mlength = len(self.mapped_dataset.encoders[clss])
@@ -354,6 +359,8 @@ class SimpleAnnDataset(torchDataset):
 def mapped(
     dataset,
     obs_keys: list[str] | None = None,
+    obsm_keys: list[str] | None = None,
+    obs_filter: dict[str, str | tuple[str, ...]] | None = None,
     join: Literal["inner", "outer"] | None = "inner",
     encode_labels: bool | list[str] = True,
     unknown_label: str | dict[str, str] | None = None,
@@ -362,6 +369,8 @@ def mapped(
     dtype: str | None = None,
     stream: bool = False,
     is_run_input: bool | None = None,
+    metacell_mode: bool = False,
+    meta_assays: list[str] = ["EFO:0022857", "EFO:0010961"],
 ) -> MappedCollection:
     path_list = []
     for artifact in dataset.artifacts.all():
@@ -378,11 +387,15 @@ def mapped(
     ds = MappedCollection(
         path_list=path_list,
         obs_keys=obs_keys,
+        obsm_keys=obsm_keys,
+        obs_filter=obs_filter,
         join=join,
         encode_labels=encode_labels,
         unknown_label=unknown_label,
         cache_categories=cache_categories,
         parallel=parallel,
         dtype=dtype,
+        meta_assays=meta_assays,
+        metacell_mode=metacell_mode,
     )
     return ds

scdataloader 1.6.4__py3-none-any.whl → 1.8.0__py3-none-any.whl

scdataloader-1.6.4-py3-none-any.whl → scdataloader-1.8.0-py3-none-any.whl