PyPI - scdataloader - Versions diffs - 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl - Mend

scdataloader 0.0.2py3-none-any.whl → 0.0.4py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

scdataloader/VERSION +1 -1
scdataloader/__init__.py +4 -0
scdataloader/__main__.py +209 -0
scdataloader/collator.py +307 -0
scdataloader/config.py +106 -0
scdataloader/data.py +181 -218
scdataloader/datamodule.py +375 -0
scdataloader/mapped.py +46 -32
scdataloader/preprocess.py +524 -208
scdataloader/utils.py +189 -123
{scdataloader-0.0.2.dist-info → scdataloader-0.0.4.dist-info}/METADATA +77 -7
scdataloader-0.0.4.dist-info/RECORD +16 -0
{scdataloader-0.0.2.dist-info → scdataloader-0.0.4.dist-info}/WHEEL +1 -1
scdataloader-0.0.2.dist-info/RECORD +0 -12
{scdataloader-0.0.2.dist-info → scdataloader-0.0.4.dist-info}/LICENSE +0 -0
{scdataloader-0.0.2.dist-info → scdataloader-0.0.4.dist-info}/entry_points.txt +0 -0

scdataloader/preprocess.py CHANGED Viewed

@@ -1,8 +1,7 @@
-from typing import Callable, Optional, Union
+from typing import Any, Callable, Optional, Union
 from uuid import uuid4
 import anndata as ad
-import bionty as bt
 import lamindb as ln
 import numpy as np
 import pandas as pd
@@ -10,6 +9,7 @@ import scanpy as sc
 from anndata import AnnData
 from django.db import IntegrityError
 from scipy.sparse import csr_matrix
+import os
 from scdataloader import utils as data_utils
@@ -19,6 +19,8 @@ FULL_LENGTH_ASSAYS = [
     "EFO:0008931",
 ]
+MAXFILESIZE = 10_000_000_000
 class Preprocessor:
     """
@@ -28,30 +30,31 @@ class Preprocessor:
     def __init__(
         self,
-        lb,
         filter_gene_by_counts: Union[int, bool] = False,
         filter_cell_by_counts: Union[int, bool] = False,
-        normalize_total: Union[float, bool] = False,
-        log1p: bool = False,
-        subset_hvg: Union[int, bool] = False,
+        normalize_sum: float = 1e4,
+        subset_hvg: int = 0,
+        use_layer: Optional[str] = None,
+        is_symbol: bool = False,
         hvg_flavor: str = "seurat_v3",
         binning: Optional[int] = None,
         result_binned_key: str = "X_binned",
         length_normalize: bool = False,
+        force_preprocess: bool = False,
+        min_dataset_size: int = 100,
+        min_valid_genes_id: int = 10_000,
+        min_nnz_genes: int = 200,
+        maxdropamount: int = 50,
+        madoutlier: int = 5,
+        pct_mt_outlier: int = 8,
+        batch_key: Optional[str] = None,
+        skip_validate: bool = False,
         additional_preprocess: Optional[Callable[[AnnData], AnnData]] = None,
         additional_postprocess: Optional[Callable[[AnnData], AnnData]] = None,
-        force_preprocess=False,
-        min_dataset_size=100,
-        min_valid_genes_id=10_000,
-        min_nnz_genes=200,
-        maxdropamount=2,
-        madoutlier=5,
-        pct_mt_outlier=8,
-        batch_key=None,
-        erase_prev_dataset: bool = False,
-        cache: bool = True,
-        stream: bool = False,
-    ):
+        do_postp: bool = True,
+        organisms: list[str] = ["NCBITaxon:9606", "NCBITaxon:10090"],
+        use_raw: bool = True,
+    ) -> None:
         """
         Initializes the preprocessor and configures the workflow steps.
@@ -60,7 +63,7 @@ class Preprocessor:
                 If int, filters genes with counts. Defaults to False.
             filter_cell_by_counts (int or bool, optional): Determines whether to filter cells by counts.
                 If int, filters cells with counts. Defaults to False.
-            normalize_total (float or bool, optional): Determines whether to normalize the total counts of each cell to a specific value.
+            normalize_sum (float or bool, optional): Determines whether to normalize the total counts of each cell to a specific value.
                 Defaults to 1e4.
             log1p (bool, optional): Determines whether to apply log1p transform to the normalized data.
                 Defaults to True.
@@ -71,19 +74,38 @@ class Preprocessor:
             binning (int, optional): Determines whether to bin the data into discrete values of number of bins provided.
             result_binned_key (str, optional): Specifies the key of :class:`~anndata.AnnData` to store the binned data.
                 Defaults to "X_binned".
+            length_normalize (bool, optional): Determines whether to length normalize the data.
+                Defaults to False.
+            force_preprocess (bool, optional): Determines whether to bypass the check of raw counts.
+                Defaults to False.
+            min_dataset_size (int, optional): The minimum size required for a dataset to be kept.
+                Defaults to 100.
+            min_valid_genes_id (int, optional): The minimum number of valid genes to keep a dataset.
+                Defaults to 10_000.
+            min_nnz_genes (int, optional): The minimum number of non-zero genes to keep a cell.
+                Defaults to 200.
+            maxdropamount (int, optional): The maximum amount of dropped cells per dataset. (2 for 50% drop, 3 for 33% drop, etc.)
+                Defaults to 2.
+            madoutlier (int, optional): The maximum absolute deviation of the outlier samples.
+                Defaults to 5.
+            pct_mt_outlier (int, optional): The maximum percentage of mitochondrial genes outlier.
+                Defaults to 8.
+            batch_key (str, optional): The key of :class:`~anndata.AnnData.obs` to use for batch information.
+                This arg is used in the highly variable gene selection step.
+            skip_validate (bool, optional): Determines whether to skip the validation step.
+                Defaults to False.
         """
         self.filter_gene_by_counts = filter_gene_by_counts
         self.filter_cell_by_counts = filter_cell_by_counts
-        self.normalize_total = normalize_total
-        self.log1p = log1p
+        self.normalize_sum = normalize_sum
         self.subset_hvg = subset_hvg
         self.hvg_flavor = hvg_flavor
         self.binning = binning
+        self.organisms = organisms
         self.result_binned_key = result_binned_key
         self.additional_preprocess = additional_preprocess
         self.additional_postprocess = additional_postprocess
         self.force_preprocess = force_preprocess
-        self.lb = lb
         self.min_dataset_size = min_dataset_size
         self.min_valid_genes_id = min_valid_genes_id
         self.min_nnz_genes = min_nnz_genes
@@ -91,124 +113,80 @@ class Preprocessor:
         self.madoutlier = madoutlier
         self.pct_mt_outlier = pct_mt_outlier
         self.batch_key = batch_key
-        self.erase_prev_dataset = erase_prev_dataset
         self.length_normalize = length_normalize
-        self.cache = cache
-        self.stream = stream
-    def __call__(
-        self,
-        data: Union[ln.Dataset, AnnData] = None,
-        name="preprocessed dataset",
-        description="preprocessed dataset using scprint",
-        start_at=0,
-    ):
-        """
-        format controls the different input value wrapping, including categorical
-        binned style, fixed-sum normalized counts, log1p fixed-sum normalized counts, etc.
-        Args:
-            adata (AnnData): The AnnData object to preprocess.
-            batch_key (str, optional): The key of AnnData.obs to use for batch information. This arg
-                is used in the highly variable gene selection step.
-        """
-        files = []
-        all_ready_processed_keys = set()
-        if self.cache:
-            for i in ln.Artifact.filter(description="preprocessed by scprint"):
-                all_ready_processed_keys.add(i.initial_version.key)
-        if isinstance(data, AnnData):
-            return self.preprocess(data)
-        elif isinstance(data, ln.Dataset):
-            for i, file in enumerate(data.artifacts.all()[start_at:]):
-                # use the counts matrix
-                print(i)
-                if file.key in all_ready_processed_keys:
-                    print(f"{file.key} is already processed")
-                    continue
-                print(file)
-                if file.backed().obs.is_primary_data.sum() == 0:
-                    print(f"{file.key} only contains non primary cells")
-                    continue
-                adata = file.load(stream=self.stream)
-                print(adata)
-                try:
-                    adata = self.preprocess(adata)
-                except ValueError as v:
-                    if v.args[0].startswith(
-                        "Dataset dropped because contains too many secondary"
-                    ):
-                        print(v)
-                        continue
-                    else:
-                        raise v
-                try:
-                    file.save()
-                except IntegrityError as e:
-                    # UNIQUE constraint failed: lnschema_bionty_organism.ontology_id
-                    print(f"seeing {e}... continuing")
-                myfile = ln.Artifact(
-                    adata,
-                    is_new_version_of=file,
-                    description="preprocessed by scprint",
-                )
-                # issues with KLlggfw6I6lvmbqiZm46
-                myfile.save()
-                files.append(myfile)
-            dataset = ln.Dataset(files, name=name, description=description)
-            dataset.save()
-            return dataset
-        else:
-            raise ValueError("Please provide either anndata or ln.Dataset")
-    def preprocess(self, adata: AnnData):
+        self.skip_validate = skip_validate
+        self.use_layer = use_layer
+        self.is_symbol = is_symbol
+        self.do_postp = do_postp
+        self.use_raw = use_raw
+    def __call__(self, adata) -> AnnData:
+        if adata[0].obs.organism_ontology_term_id.iloc[0] not in self.organisms:
+            raise ValueError(
+                "we cannot work with this organism",
+                adata[0].obs.organism_ontology_term_id.iloc[0],
+            )
         if self.additional_preprocess is not None:
             adata = self.additional_preprocess(adata)
-        if adata.raw is not None:
+        if adata.raw is not None and self.use_raw:
             adata.X = adata.raw.X
             del adata.raw
+        if self.use_layer is not None:
+            adata.X = adata.layers[self.use_layer]
         if adata.layers is not None:
+            if "counts" in adata.layers.keys():
+                if np.abs(adata[:50_000].X.astype(int) - adata[:50_000].X).sum():
+                    print("X was not raw counts, using 'counts' layer")
+                    adata.X = adata.layers["counts"].copy()
+            print("Dropping layers: ", adata.layers.keys())
             del adata.layers
         if len(adata.varm.keys()) > 0:
             del adata.varm
-        if len(adata.obsm.keys()) > 0:
+        if len(adata.obsm.keys()) > 0 and self.do_postp:
             del adata.obsm
-        if len(adata.obsp.keys()) > 0:
+        if len(adata.obsp.keys()) > 0 and self.do_postp:
             del adata.obsp
         if len(adata.uns.keys()) > 0:
             del adata.uns
         if len(adata.varp.keys()) > 0:
             del adata.varp
         # check that it is a count
-        if (
-            int(adata.X[:100].max()) != adata.X[:100].max()
-            and not self.force_preprocess
-        ):  # check if likely raw data
-            raise ValueError(
-                "Data is not raw counts, please check layers, find raw data, or bypass with force_preprocess"
-            )
+        print("checking raw counts")
+        if np.abs(
+            adata[:50_000].X.astype(int) - adata[:50_000].X
+        ).sum():  # check if likely raw data
+            if not self.force_preprocess:
+                raise ValueError(
+                    "Data is not raw counts, please check layers, find raw data, or bypass with force_preprocess"
+                )
+            else:
+                print(
+                    "Data is not raw counts, please check layers, find raw data, or bypass with force_preprocess"
+                )
             # please check layers
             # if not available count drop
+        prevsize = adata.shape[0]
+        # dropping non primary
+        if "is_primary_data" in adata.obs.columns:
+            adata = adata[adata.obs.is_primary_data]
+        if adata.shape[0] < self.min_dataset_size:
+            raise Exception("Dataset dropped due to too many secondary cells")
+        print(
+            "removed {} non primary cells, {} renamining".format(
+                prevsize - adata.shape[0], adata.shape[0]
+            )
+        )
         # # cleanup and dropping low expressed genes and unexpressed cells
         prevsize = adata.shape[0]
         adata.obs["nnz"] = np.array(np.sum(adata.X != 0, axis=1).flatten())[0]
-        adata = adata[
-            (adata.obs["nnz"] > self.min_nnz_genes)
-            # or if slide-seq
-            | (
-                (adata.obs.assay_ontology_term_id == "EFO:0030062")
-                & (adata.obs["nnz"] > (self.min_nnz_genes / 3))
-            )
-        ]
+        adata = adata[(adata.obs["nnz"] > self.min_nnz_genes)]
         if self.filter_gene_by_counts:
             sc.pp.filter_genes(adata, min_counts=self.filter_gene_by_counts)
         if self.filter_cell_by_counts:
             sc.pp.filter_cells(adata, min_counts=self.filter_cell_by_counts)
         # if lost > 50% of the dataset, drop dataset
         # load the genes
-        genesdf = self.load_genes(adata.obs.organism_ontology_term_id[0])
+        genesdf = data_utils.load_genes(adata.obs.organism_ontology_term_id.iloc[0])
         if prevsize / adata.shape[0] > self.maxdropamount:
             raise Exception(
@@ -220,19 +198,32 @@ class Preprocessor:
                 "Dataset dropped due to low expressed genes and unexpressed cells: current size: "
                 + str(adata.shape[0])
             )
-        # dropping non primary
-        adata = adata[adata.obs.is_primary_data]
-        if adata.shape[0] < self.min_dataset_size:
-            raise ValueError(
-                "Dataset dropped because contains too many secondary cells"
+        print(
+            "filtered out {} cells, {} renamining".format(
+                prevsize - adata.shape[0], adata.shape[0]
             )
+        )
+        if self.is_symbol:
+            genesdf["ensembl_gene_id"] = genesdf.index
+            var = (
+                adata.var.merge(
+                    genesdf.drop_duplicates("symbol").set_index("symbol", drop=False),
+                    left_index=True,
+                    right_index=True,
+                    how="inner",
+                )
+                .sort_values(by="ensembl_gene_id")
+                .set_index("ensembl_gene_id")
+            )
+            adata = adata[:, var["symbol"]]
+            adata.var = var
+            genesdf = genesdf.set_index("ensembl_gene_id")
         intersect_genes = set(adata.var.index).intersection(set(genesdf.index))
         print(f"Removed {len(adata.var.index) - len(intersect_genes)} genes.")
         if len(intersect_genes) < self.min_valid_genes_id:
-            raise Exception(
-                "Dataset dropped due to too many genes not mapping to it"
-            )
+            raise Exception("Dataset dropped due to too many genes not mapping to it")
         adata = adata[:, list(intersect_genes)]
         # marking unseen genes
         unseen = set(genesdf.index) - set(adata.var.index)
@@ -245,60 +236,55 @@ class Preprocessor:
         adata = ad.concat([adata, emptyda], axis=1, join="outer", merge="only")
         # do a validation function
         adata.uns["unseen_genes"] = list(unseen)
-        data_utils.validate(
-            adata, self.lb, organism=adata.obs.organism_ontology_term_id[0]
-        )
-        # length normalization
-        if (
-            adata.obs["assay_ontology_term_id"].isin(FULL_LENGTH_ASSAYS).any()
-            and self.length_normalize
-        ):
-            subadata = data_utils.length_normalize(
-                adata[
-                    adata.obs["assay_ontology_term_id"].isin(
-                        FULL_LENGTH_ASSAYS
-                    )
-                ],
-            )
+        if not self.skip_validate:
+            print("validating")
+            data_utils.validate(adata, organism=adata.obs.organism_ontology_term_id[0])
+            # length normalization
+            if (
+                adata.obs["assay_ontology_term_id"].isin(FULL_LENGTH_ASSAYS).any()
+                and self.length_normalize
+            ):
+                print("doing length norm")
+                subadata = data_utils.length_normalize(
+                    adata[adata.obs["assay_ontology_term_id"].isin(FULL_LENGTH_ASSAYS)],
+                )
-            adata = ad.concat(
-                [
-                    adata[
-                        ~adata.obs["assay_ontology_term_id"].isin(
-                            FULL_LENGTH_ASSAYS
-                        )
+                adata = ad.concat(
+                    [
+                        adata[
+                            ~adata.obs["assay_ontology_term_id"].isin(
+                                FULL_LENGTH_ASSAYS
+                            )
+                        ],
+                        subadata,
                     ],
-                    subadata,
-                ],
-                axis=0,
-                join="outer",
-                merge="only",
-            )
-        # step 3: normalize total
-        if self.normalize_total:
-            sc.pp.normalize_total(adata, target_sum=self.normalize_total)
-        if self.log1p and not is_log1p(adata):
-            sc.pp.log1p(adata)
+                    axis=0,
+                    join="outer",
+                    merge="only",
+                )
         # QC
         adata.var[genesdf.columns] = genesdf.loc[adata.var.index]
+        for name in ["stable_id", "created_at", "updated_at"]:
+            if name in adata.var.columns:
+                adata.var = adata.var.drop(columns=name)
+        print("startin QC")
         sc.pp.calculate_qc_metrics(
             adata, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20]
         )
         adata.obs["outlier"] = (
             data_utils.is_outlier(adata, "total_counts", self.madoutlier)
-            | data_utils.is_outlier(
-                adata, "n_genes_by_counts", self.madoutlier
-            )
+            | data_utils.is_outlier(adata, "n_genes_by_counts", self.madoutlier)
             | data_utils.is_outlier(
                 adata, "pct_counts_in_top_20_genes", self.madoutlier
             )
         )
-        adata.obs["mt_outlier"] = data_utils.is_outlier(
-            adata, "pct_counts_mt", 3
-        ) | (adata.obs["pct_counts_mt"] > self.pct_mt_outlier)
+        adata.obs["mt_outlier"] = data_utils.is_outlier(adata, "pct_counts_mt", 3) | (
+            adata.obs["pct_counts_mt"] > self.pct_mt_outlier
+        )
         total_outliers = (adata.obs["outlier"] | adata.obs["mt_outlier"]).sum()
         total_cells = adata.shape[0]
         percentage_outliers = (total_outliers / total_cells) * 100
@@ -309,27 +295,38 @@ class Preprocessor:
         #    raise Exception("More than 50% of the dataset has been dropped due to outliers.")
         # adata = adata[(~adata.obs.outlier) & (~adata.obs.mt_outlier)].copy()
         # remaining
-        # step 5: subset hvg
-        if self.subset_hvg:
-            sc.pp.highly_variable_genes(
-                adata,
-                n_top_genes=self.subset_hvg,
-                batch_key=self.batch_key,
-                flavor=self.hvg_flavor,
-                subset=True,
-            )
         # based on the topometry paper https://www.biorxiv.org/content/10.1101/2022.03.14.484134v2
         # https://rapids-singlecell.readthedocs.io/en/latest/api/generated/rapids_singlecell.pp.pca.html#rapids_singlecell.pp.pca
-        sc.pp.neighbors(
-            adata, n_pcs=500 if adata.shape[0] > 500 else adata.shape[0] - 2
-        )
-        sc.tl.leiden(adata, key_added="leiden_3", resolution=3.0)
-        sc.tl.leiden(adata, key_added="leiden_2", resolution=2.0)
-        sc.tl.leiden(adata, key_added="leiden_1", resolution=1.0)
-        sc.tl.umap(adata)
-        # additional
-        if self.additional_postprocess is not None:
-            adata = self.additional_postprocess(adata)
+        if self.do_postp:
+            print("normalize")
+            adata.layers["clean"] = sc.pp.log1p(
+                sc.pp.normalize_total(
+                    adata, target_sum=self.normalize_sum, inplace=False
+                )["X"]
+            )
+            # step 5: subset hvg
+            if self.subset_hvg:
+                sc.pp.highly_variable_genes(
+                    adata,
+                    layer="clean",
+                    n_top_genes=self.subset_hvg,
+                    batch_key=self.batch_key,
+                    flavor=self.hvg_flavor,
+                    subset=False,
+                )
+            adata.obsm["clean_pca"] = sc.pp.pca(
+                adata.layers["clean"],
+                n_comps=300 if adata.shape[0] > 300 else adata.shape[0] - 2,
+            )
+            sc.pp.neighbors(adata, use_rep="clean_pca")
+            sc.tl.leiden(adata, key_added="leiden_3", resolution=3.0)
+            sc.tl.leiden(adata, key_added="leiden_2", resolution=2.0)
+            sc.tl.leiden(adata, key_added="leiden_1", resolution=1.0)
+            sc.tl.umap(adata)
+            # additional
+            if self.additional_postprocess is not None:
+                adata = self.additional_postprocess(adata)
         adata = adata[:, adata.var.sort_index().index]
         # create random ids for all cells
         adata.obs.index = [str(uuid4()) for _ in range(adata.shape[0])]
@@ -341,9 +338,7 @@ class Preprocessor:
             print("Binning data ...")
             if not isinstance(self.binning, int):
                 raise ValueError(
-                    "Binning arg must be an integer, but got {}.".format(
-                        self.binning
-                    )
+                    "Binning arg must be an integer, but got {}.".format(self.binning)
                 )
             # NOTE: the first bin is always a spectial for zero
             n_bins = self.binning
@@ -379,23 +374,140 @@ class Preprocessor:
                 bin_edges.append(np.concatenate([[0], bins]))
             adata.layers[self.result_binned_key] = np.stack(binned_rows)
             adata.obsm["bin_edges"] = np.stack(bin_edges)
+        print("done")
         return adata
-    def load_genes(self, organism):
-        genesdf = bt.Gene(
-            organism=self.lb.Organism.filter(ontology_id=organism).first().name
-        ).df()
-        genesdf = genesdf.drop_duplicates(subset="ensembl_gene_id")
-        genesdf = genesdf.set_index("ensembl_gene_id")
-        # mitochondrial genes
-        genesdf["mt"] = genesdf.symbol.astype(str).str.startswith("MT-")
-        # ribosomal genes
-        genesdf["ribo"] = genesdf.symbol.astype(str).str.startswith(
-            ("RPS", "RPL")
-        )
-        # hemoglobin genes.
-        genesdf["hb"] = genesdf.symbol.astype(str).str.contains(("^HB[^(P)]"))
-        return genesdf
+class LaminPreprocessor(Preprocessor):
+    def __init__(
+        self,
+        *args,
+        erase_prev_dataset: bool = False,
+        cache: bool = True,
+        stream: bool = False,
+        keep_files: bool = True,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.erase_prev_dataset = erase_prev_dataset
+        self.cache = cache
+        self.stream = stream
+        self.keep_files = keep_files
+    def __call__(
+        self,
+        data: Union[ln.Collection, AnnData] = None,
+        name="preprocessed dataset",
+        description="preprocessed dataset using scprint",
+        start_at=0,
+        version=2,
+    ):
+        """
+        format controls the different input value wrapping, including categorical
+        binned style, fixed-sum normalized counts, log1p fixed-sum normalized counts, etc.
+        Args:
+            adata (AnnData): The AnnData object to preprocess.
+            batch_key (str, optional): The key of AnnData.obs to use for batch information. This arg
+                is used in the highly variable gene selection step.
+        """
+        files = []
+        all_ready_processed_keys = set()
+        if self.cache:
+            for i in ln.Artifact.filter(description=description):
+                all_ready_processed_keys.add(i.stem_uid)
+        if isinstance(data, AnnData):
+            return super().__call__(data)
+        elif isinstance(data, ln.Collection):
+            for i, file in enumerate(data.artifacts.all()[start_at:]):
+                # use the counts matrix
+                print(i)
+                if file.stem_uid in all_ready_processed_keys:
+                    print(f"{file.stem_uid} is already processed... not preprocessing")
+                    continue
+                print(file)
+                backed = file.backed()
+                if backed.obs.is_primary_data.sum() == 0:
+                    print(f"{file.key} only contains non primary cells.. dropping")
+                    continue
+                if backed.shape[1] < 1000:
+                    print(
+                        f"{file.key} only contains less than 1000 genes and is likely not scRNAseq... dropping"
+                    )
+                    continue
+                if file.size <= MAXFILESIZE:
+                    adata = file.load(stream=self.stream)
+                    print(adata)
+                else:
+                    badata = backed
+                    print(badata)
+                try:
+                    if file.size > MAXFILESIZE:
+                        print(
+                            f"dividing the dataset as it is too large: {file.size//1_000_000_000}Gb"
+                        )
+                        num_blocks = int(np.ceil(file.size / (MAXFILESIZE / 2)))
+                        block_size = int(
+                            (np.ceil(badata.shape[0] / 30_000) * 30_000) // num_blocks
+                        )
+                        print("num blocks ", num_blocks)
+                        for i in range(num_blocks):
+                            start_index = i * block_size
+                            end_index = min((i + 1) * block_size, badata.shape[0])
+                            block = badata[start_index:end_index].to_memory()
+                            print(block)
+                            block = super().__call__(block)
+                            myfile = ln.Artifact(
+                                block,
+                                is_new_version_of=file,
+                                description=description,
+                                version=str(version) + "_s" + str(i),
+                            )
+                            myfile.save()
+                            if self.keep_files:
+                                files.append(myfile)
+                            else:
+                                del myfile
+                                del block
+                    else:
+                        adata = super().__call__(adata)
+                        myfile = ln.Artifact(
+                            adata,
+                            is_new_version_of=file,
+                            description=description,
+                            version=str(version),
+                        )
+                        myfile.save()
+                        if self.keep_files:
+                            files.append(myfile)
+                        else:
+                            del myfile
+                            del adata
+                except ValueError as v:
+                    if v.args[0].startswith("we cannot work with this organism"):
+                        print(v)
+                        continue
+                    else:
+                        raise v
+                except Exception as e:
+                    if e.args[0].startswith("Dataset dropped due to"):
+                        print(e)
+                        continue
+                    else:
+                        raise e
+                # issues with KLlggfw6I6lvmbqiZm46
+            if self.keep_files:
+                dataset = ln.Collection(files, name=name, description=description)
+                dataset.save()
+                return dataset
+            else:
+                return
+        else:
+            raise ValueError("Please provide either anndata or ln.Collection")
 def is_log1p(adata: AnnData) -> bool:
@@ -494,7 +606,7 @@ def additional_preprocess(adata):
     adata.obs["cell_culture"] = False
     # if cell_type contains the word "(cell culture)" then it is a cell culture and we mark it as so and remove this from the cell type
     loc = adata.obs["cell_type_ontology_term_id"].str.contains(
-        "(cell culture)"
+        "(cell culture)", regex=False
     )
     if loc.sum() > 0:
         adata.obs["cell_type_ontology_term_id"] = adata.obs[
@@ -505,7 +617,9 @@ def additional_preprocess(adata):
             loc, "cell_type_ontology_term_id"
         ].str.replace(" (cell culture)", "")
-    loc = adata.obs["tissue_ontology_term_id"].str.contains("(cell culture)")
+    loc = adata.obs["tissue_ontology_term_id"].str.contains(
+        "(cell culture)", regex=False
+    )
     if loc.sum() > 0:
         adata.obs.loc[loc, "cell_culture"] = True
         adata.obs["tissue_ontology_term_id"] = adata.obs[
@@ -513,9 +627,9 @@ def additional_preprocess(adata):
         ].astype(str)
         adata.obs.loc[loc, "tissue_ontology_term_id"] = adata.obs.loc[
             loc, "tissue_ontology_term_id"
-        ].str.replace(r" \(cell culture\)", "")
+        ].str.replace(" (cell culture)", "")
-    loc = adata.obs["tissue_ontology_term_id"].str.contains("(organoid)")
+    loc = adata.obs["tissue_ontology_term_id"].str.contains("(organoid)", regex=False)
     if loc.sum() > 0:
         adata.obs.loc[loc, "cell_culture"] = True
         adata.obs["tissue_ontology_term_id"] = adata.obs[
@@ -523,9 +637,9 @@ def additional_preprocess(adata):
         ].astype(str)
         adata.obs.loc[loc, "tissue_ontology_term_id"] = adata.obs.loc[
             loc, "tissue_ontology_term_id"
-        ].str.replace(r" \(organoid\)", "")
+        ].str.replace(" (organoid)", "")
-    loc = adata.obs["tissue_ontology_term_id"].str.contains("CL:")
+    loc = adata.obs["tissue_ontology_term_id"].str.contains("CL:", regex=False)
     if loc.sum() > 0:
         adata.obs["tissue_ontology_term_id"] = adata.obs[
             "tissue_ontology_term_id"
@@ -553,12 +667,8 @@ def additional_postprocess(adata):
     )  # + "_" + adata.obs['dataset_id'].astype(str)
     # if group is too small
-    okgroup = [
-        i for i, j in adata.obs["dpt_group"].value_counts().items() if j >= 10
-    ]
-    not_okgroup = [
-        i for i, j in adata.obs["dpt_group"].value_counts().items() if j < 3
-    ]
+    okgroup = [i for i, j in adata.obs["dpt_group"].value_counts().items() if j >= 10]
+    not_okgroup = [i for i, j in adata.obs["dpt_group"].value_counts().items() if j < 3]
     # set the group to empty
     adata.obs.loc[adata.obs["dpt_group"].isin(not_okgroup), "dpt_group"] = ""
     adata.obs["heat_diff"] = np.nan
@@ -582,3 +692,209 @@ def additional_postprocess(adata):
     # to query N next time points we just get the N elements below and check they are in the group
     # to query the N nearest neighbors we just get the N elements above and N below and check they are in the group
     return adata
+"""
+sexr = {
+    "Male": "PATO:0000384",
+    "Female": "PATO:0000383",
+}
+tissuer = {
+    "Kidney": "UBERON:0002113",
+    "Lung": "UBERON:0002048",
+    "Heart": "UBERON:0000948",
+    "Liver": "UBERON:0002107",
+    "Brain": "UBERON:0000955",
+    "BAT": "UBERON:0001348",
+    "Jejunum": "UBERON:0002115",
+    "Colon": "UBERON:0001155",
+    "Ileum": "UBERON:0002116",
+    "Stomach": "UBERON:0000945",
+    "gWAT": "UBERON:0001347",
+    "Duodenum": "UBERON:0002114",
+    "iWAT": "UBERON:0001347",
+    "Muscle": "UBERON:0001630",
+}
+ager = {
+    "03_months": "MmusDv:0000063",
+    "16_months": "MmusDv:0000087",
+    "06_months": "MmusDv:0000077",
+    "23_months": "MmusDv:0000127",
+    "12_months": "MmusDv:0000083",
+    "21_months": "MmusDv:0000125",
+}
+celltyper = {
+    "Proximal tubule cells": "epithelial cell of proximal tubule",
+    "Vascular endothelial cells": "endothelial cell of vascular tree",
+    "Intestinal epithelial cells": "intestinal epithelial cell",
+    "Hepatocytes": "hepatocyte",
+    "Fibroblasts": "fibroblast",
+    "Lymphoid cells_T cells": "T cell",
+    "Myeloid cells": "myeloid cell",
+    "Brown adipocytes": "brown fat cell",
+    "Lymphoid cells_B cells": "B cell",
+    "Adipocytes": "fat cell",
+    "Type II alveolar epithelial cells": "type II pneumocyte",
+    "Colonic epithelial cells": "colon epithelial cell",
+    "Mural cells": "mural cell",
+    "Cerebellum granule neurons": "cerebellar neuron",
+    "Goblet cells": "goblet cell",
+    "Vascular endothelial cells_General capillary cells": "endothelial cell of vascular tree",
+    "Ventricular cardiomyocytes": "regular ventricular cardiac myocyte",
+    "Type II myonuclei": "type II muscle cell",
+    "Thick ascending limb of LOH cells": "vasa recta ascending limb cell",
+    "Gastric mucous cells": "mucous cell of stomach",
+    "Distal convoluted tubule cells": "kidney distal convoluted tubule epithelial cell",
+    "Adipoce stem and progenitor cells": "hepatic oval stem cell",
+    "Chief cells": "chief cell of parathyroid gland",
+    "Paneth cells": "paneth cell",
+    "Myeloid cells_Alveolar macrophages": "alveolar macrophage",
+    "Lymphoid cells_Plasma cells": "plasma cell",
+    "Secretory cells": "secretory cell",
+    "Lymphoid cells_Resting B cells": "B cell",
+    "Cortical projection neurons 1": "corticothalamic-projecting glutamatergic cortical neuron",
+    "Endocardial endothelial cells": "endocardial cell",
+    "Type I alveolar epithelial cells": "type I pneumocyte",
+    "Interbrain and midbrain neurons 1": "midbrain dopaminergic neuron",
+    "Interbrain and midbrain neurons 2": "midbrain dopaminergic neuron",
+    "Myeloid cells_Monocytes": "monocyte",
+    "Myeloid cells_Dendritic cells": "myeloid dendritic cell",
+    "Oligodendrocytes": "oligodendrocyte",
+    "Lymphatic endothelial cells": "endothelial cell of lymphatic vessel",
+    "Enteroendocrine cells": "enteroendocrine cell",
+    "Vascular endothelial cells_Aerocytes": "endothelial cell of vascular tree",
+    "Gastric epithelial cells": "epithelial cell of stomach",
+    "Fibro–adipogenic progenitors": "fibro/adipogenic progenitor cell",
+    "Parietal cells": "parietal cell",
+    "Astrocytes": "astrocyte",
+    "Connecting tubule cells": "kidney connecting tubule beta-intercalated cell",
+    "Hepatic stellate cells": "hepatic stellate cell",
+    "Striatal neurons 1": "striatum neuron",
+    "Mesothelial cells": "mesothelial cell",
+    "Lymphoid cells_Cycling B cells": "germinal center B cell",
+    "Type B intercalated cells": "renal beta-intercalated cell",
+    "Type A intercalated cells": "renal alpha-intercalated cell",
+    "Myeloid cells_Neutrophils": "neutrophil",
+    "Principal cells": "renal principal cell",
+    "Cortical projection neurons 2": "corticothalamic-projecting glutamatergic cortical neuron",
+    "Muc2-producing goblet cells": "intestine goblet cell",
+    "OB neurons 1": "olfactory bulb interneuron",
+    "Atrial cardiomyocytes": "regular atrial cardiac myocyte",
+    "Lymphoid cells": "leukocyte",
+    "Skeletal muscle cells": "cell of skeletal muscle",
+    "Neural cells": "neural cell",
+    "Cerebellum interneurons": "cerebellar neuron",
+    "Interneurons 1": "interneuron",
+    "Descending thin limb of LOH cells": "vasa recta descending limb cell",
+    "Tuft cells": "intestinal tuft cell",
+    "Oligodendrocyte progenitor cells": "oligodendrocyte precursor cell",
+    "Enteric glia": "enteroglial cell",
+    "Endothelial cells": "endothelial cell",
+    "Dentate gyrus neurons": "dentate gyrus neuron",
+    "Myeloid cells_Interstitial macrophages": "tissue-resident macrophage",
+    "Ciliated cells": "ciliated cell",
+    "Microglia": "microglial cell",
+    "Interneurons 2": "interneuron",
+    "Ncam1 positive cells": "parafollicular cell",
+    "Rdh16 positive cells": "unknown",
+    "Circulating hepatoblasts": "hepatoblast",
+    "Enteric neurons": "enteric neuron",
+    "Ascending thin limb of LOH cells": "vasa recta ascending limb cell",
+    "Mfge8 positive cells": "unknown",
+    "Cholangiocytes": "cholangiocyte",
+    "Podocytes": "podocyte",
+    "Muscle satellite cells": "skeletal muscle satellite cell",
+    "Purkinje neurons": "Purkinje cell",
+    "Juxtaglomerular cells": "juxtaglomerular complex cell",
+    "Ngf positive cells": "neurogliaform cell",
+    "Bergmann glia": "Bergmann glial cell",
+    "Megf11 positive cells": "unknown",
+    "Myotendinous junction myonuclei": "unknown",
+    "Vascular leptomeningeal cells": "vascular leptomeningeal cell",
+    "Urothelial cells": "urothelial cell",
+    "Tenocytes": "tendon cell",
+    "Myelinating Schwann cells": "myelinating Schwann cell",
+    "Epididymal cells": "epididymis glandular cell",
+    "Muc6-producing goblet cells": "lung goblet cell",
+    "Type I myonuclei": "type I muscle cell",
+    "OB neurons 2": "olfactory bulb interneuron",
+    "Sis positive cells": "unknown",
+    "Lgr5 positive cells": "unknown",
+    "Macula densa cells": "macula densa epithelial cell",
+    "Choroid plexus epithelial cells": "choroid plexus epithelial cell",
+    "Cortical projection neurons 3": "corticothalamic-projecting glutamatergic cortical neuron",
+    "Interstitial cells of Cajal": "interstitial cell of Cajal",
+    "Cacna1b positive cells": "unknown",
+    "Hindbrain neurons 2": "neuron",
+    "Myeloid cells_Basophils": "basophil",
+    "Ependymal cells": "ependymal cell",
+    "Muc5ac-producing goblet cells": "lung goblet cell",
+    "Myeloid cells_Mast cells": "mast cell",
+    "Pulmonary neuroendocrine cells": "lung neuroendocrine cell",
+    "Basal cells": "basal cell",
+    "OB neurons 3": "olfactory bulb interneuron",
+    "Non-myelinating Schwann cells": "non-myelinating Schwann cell",
+    "Asic2 positive cells": "unknown",
+    "Striatal neurons 2": "striatum neuron",
+    "Erythroblasts": "erythroblast",
+    "Hindbrain neurons 1": "neuron",
+    "Neuromuscular junction myonuclei": "unknown",
+    "Habenula neurons": "unknown",
+    "Pituitary cells": "pituitary gland cell",
+    "Unipolar brush cells": "unipolar brush cell",
+    "Pde4c positive cells": "unknown",
+    "Pancreatic acinar cells": "pancreatic acinar cell",
+    "Inferior olivary nucleus neurons": "bushy cell",
+    "Colec10 positive cells": "unknown",
+    "Fcgbp positive cells": "unknown",
+    "Fut9 positive cells": "unknown",
+    "Mirg positive cells": "unknown",
+    "Alox15 positive cells": "unknown",
+    "Osteoblasts": "osteoblast",
+}
+genesdf = utils.load_genes("NCBITaxon:10090")
+{k: v if v =="unknown" else bt.CellType.filter(name=v).one().ontology_id for k, v in celltyper.items()}
+adata.obs["organism_ontology_term_id"] = "NCBITaxon:10090"
+adata.obs["tissue_ontology_term_id"] = adata.obs["Organ_name"].replace(tissuer)
+adata.obs["cell_type_ontology_term_id"] = adata.obs["Main_cell_type"].replace(
+    celltyper
+)
+adata.obs["disease_ontology_term_id"] = "PATO:0000461"
+adata.obs["assay_ontology_term_id"] = "unknown"
+adata.obs["self_reported_ethnicity_ontology_term_id"] = "unknown"
+adata.obs["development_stage_ontology_term_id"] = adata.obs["Age_group"].replace(
+    ager
+)
+adata.obs["sex_ontology_term_id"] = adata.obs["Gender"].replace(sexr)
+for i in range(num_blocks):
+    start_index = i * block_size
+    end_index = min((i + 1) * block_size, len(adata))
+    block = adata[start_index:end_index].to_memory()
+    # process block here
+    block = block[(block.obs["Gene_count"] > 400)]
+    intersect_genes = set(block.var.index).intersection(set(genesdf.index))
+    print(f"Removed {len(block.var.index) - len(intersect_genes)} genes.")
+    block = block[:, list(intersect_genes)]
+    # marking unseen genes
+    unseen = set(genesdf.index) - set(block.var.index)
+    # adding them to adata
+    emptyda = ad.AnnData(
+        csr_matrix((block.shape[0], len(unseen)), dtype=np.float32),
+        var=pd.DataFrame(index=list(unseen)),
+        obs=pd.DataFrame(index=block.obs.index),
+    )
+    block = ad.concat([block, emptyda], axis=1, join="outer", merge="only")
+    # do a validation function
+    block.uns["unseen_genes"] = list(unseen)
+    block = block[:, block.var.sort_index().index]
+    block.var[genesdf.columns] = genesdf.loc[block.var.index]
+    for name in ["stable_id", "created_at", "updated_at"]:
+        if name in block.var.columns:
+            block.var = block.var.drop(columns=name)
+    block.write_h5ad('zhang2024_adata_'+str(i)+".h5ad")
+"""

scdataloader 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl

scdataloader 0.0.2py3-none-any.whl → 0.0.4py3-none-any.whl