scdataloader 1.0.1__py3-none-any.whl → 1.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scdataloader/VERSION +1 -1
- scdataloader/__init__.py +2 -2
- scdataloader/collator.py +6 -66
- scdataloader/data.py +42 -5
- scdataloader/datamodule.py +1 -1
- scdataloader/mapped.py +113 -92
- scdataloader/preprocess.py +1 -6
- scdataloader/utils.py +75 -85
- {scdataloader-1.0.1.dist-info → scdataloader-1.0.6.dist-info}/METADATA +68 -9
- scdataloader-1.0.6.dist-info/RECORD +16 -0
- scdataloader-1.0.1.dist-info/RECORD +0 -16
- {scdataloader-1.0.1.dist-info → scdataloader-1.0.6.dist-info}/LICENSE +0 -0
- {scdataloader-1.0.1.dist-info → scdataloader-1.0.6.dist-info}/WHEEL +0 -0
- {scdataloader-1.0.1.dist-info → scdataloader-1.0.6.dist-info}/entry_points.txt +0 -0
scdataloader/VERSION
CHANGED
@@ -1 +1 @@
-1.0.1
+1.0.5
scdataloader/__init__.py
CHANGED
scdataloader/collator.py
CHANGED
@@ -92,7 +92,10 @@ class Collator:
             )
         for organism in self.organisms:
             ogenedf = self.genedf[self.genedf.organism == organism]
-            tot = self.genedf[self.genedf.index.isin(valid_genes)]
+            if len(valid_genes) > 0:
+                tot = self.genedf[self.genedf.index.isin(valid_genes)]
+            else:
+                tot = self.genedf
             org = org_to_id[organism] if org_to_id is not None else organism
             self.start_idx.update({org: np.where(tot.organism == organism)[0][0]})
             if len(valid_genes) > 0:
@@ -108,7 +111,7 @@ class Collator:
         Args:
             batch (list[dict[str: array]]): List of dicts of arrays containing gene expression data.
                 the first list is for the different samples, the second list is for the different elements with
-                elem["x"]: gene expression
+                elem["X"]: gene expression
                 elem["organism_name"]: organism ontology term id
                 elem["tp_name"]: heat diff
                 elem["class_names.."]: other classes
@@ -132,7 +135,7 @@ class Collator:
                 continue
             if "_storage_idx" in elem:
                 dataset.append(elem["_storage_idx"])
-            expr = np.array(elem["x"])
+            expr = np.array(elem["X"])
             total_count.append(expr.sum())
             if len(self.accepted_genes) > 0:
                 expr = expr[self.accepted_genes[organism_id]]
@@ -231,69 +234,6 @@ class Collator:
         return ret


-class AnnDataCollator(Collator):
-    def __init__(self, *args, **kwargs):
-        """
-        AnnDataCollator Collator to use if working with AnnData's experimental dataloader (it is very slow!!!)
-
-        Args:
-            @see Collator
-        """
-        super().__init__(*args, **kwargs)
-
-    def __call__(self, batch) -> dict[str, Tensor]:
-        exprs = []
-        total_count = []
-        other_classes = []
-        gene_locs = []
-        tp = []
-        for elem in batch:
-            organism_id = elem.obs[self.organism_name]
-            if organism_id.item() not in self.organism_ids:
-                print(organism_id)
-            expr = np.array(elem.X[0])
-
-            total_count.append(expr.sum())
-            if len(self.accepted_genes) > 0:
-                expr = expr[self.accepted_genes[organism_id]]
-            if self.how == "most expr":
-                loc = np.argsort(expr)[-(self.max_len) :][::-1]
-            elif self.how == "random expr":
-                nnz_loc = np.where(expr > 0)[0]
-                loc = nnz_loc[
-                    np.random.choice(len(nnz_loc), self.max_len, replace=False)
-                ]
-            else:
-                raise ValueError("how must be either most expr or random expr")
-            if self.add_zero_genes > 0:
-                zero_loc = np.where(expr == 0)[0]
-                zero_loc = [
-                    np.random.choice(len(zero_loc), self.add_zero_genes, replace=False)
-                ]
-                loc = np.concatenate((loc, zero_loc), axis=None)
-            exprs.append(expr[loc])
-            gene_locs.append(loc + self.start_idx[organism_id.item()])
-
-            if self.tp_name is not None:
-                tp.append(elem.obs[self.tp_name])
-            else:
-                tp.append(0)
-
-            other_classes.append([elem.obs[i].values[0] for i in self.class_names])
-
-        expr = np.array(exprs)
-        tp = np.array(tp)
-        gene_locs = np.array(gene_locs)
-        total_count = np.array(total_count)
-        other_classes = np.array(other_classes)
-        return {
-            "x": Tensor(expr),
-            "genes": Tensor(gene_locs).int(),
-            "depth": Tensor(total_count),
-            "class": Tensor(other_classes),
-        }
-
-
 #############
 #### WIP ####
 #############
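For orientation, here is a minimal sketch of wiring this `Collator` into a PyTorch `DataLoader`. The constructor argument names are inferred from the attributes the diff touches (`self.organisms`, `self.how`, `self.max_len`, `self.add_zero_genes`) and may not match the released signature exactly; the collated batch keys ("x", "genes", "depth", "class") mirror those of the removed `AnnDataCollator`.

```python
# Hypothetical usage sketch; argument names inferred from attributes seen in
# the diff, not verified against the released signature.
from torch.utils.data import DataLoader
from scdataloader.collator import Collator

collator = Collator(
    organisms=["NCBITaxon:9606"],  # species expected in the batches
    how="most expr",               # keep the max_len most-expressed genes per cell
    max_len=2000,
    add_zero_genes=100,            # additionally sample some zero-expression genes
)
# `dataset` is any map-style dataset whose items carry an "X" expression array
# (the 1.0.6 key; it was lowercase "x" in 1.0.1) plus obs columns.
loader = DataLoader(dataset, batch_size=64, collate_fn=collator)
```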
scdataloader/data.py
CHANGED
@@ -8,7 +8,7 @@ import bionty as bt
 import pandas as pd
 from torch.utils.data import Dataset as torchDataset
 from typing import Union, Optional, Literal
-from scdataloader import mapped
+from scdataloader.mapped import MappedCollection
 import warnings

 from anndata import AnnData
@@ -74,9 +74,9 @@ class Dataset(torchDataset):
     join_vars: Literal["inner", "outer"] | None = None

     def __post_init__(self):
-        self.mapped_dataset = mapped.mapped(
+        self.mapped_dataset = mapped(
             self.lamin_dataset,
-            label_keys=self.obs,
+            obs_keys=self.obs,
             join=self.join_vars,
             encode_labels=self.clss_to_pred,
             unknown_label="unknown",
@@ -311,7 +311,7 @@ class SimpleAnnDataset(torchDataset):
         for idx, obs in enumerate(self.adata.obs.itertuples(index=False)):
             with warnings.catch_warnings():
                 warnings.filterwarnings("ignore", category=DeprecationWarning)
-                out = {"x": self.adataX[idx].reshape(-1)}
+                out = {"X": self.adataX[idx].reshape(-1)}
                 out.update(
                     {name: val for name, val in self.obs_to_output.iloc[idx].items()}
                 )
@@ -320,8 +320,45 @@
     def __getitem__(self, idx):
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore", category=DeprecationWarning)
-            out = {"x": self.adataX[idx].reshape(-1)}
+            out = {"X": self.adataX[idx].reshape(-1)}
             out.update(
                 {name: val for name, val in self.obs_to_output.iloc[idx].items()}
             )
             return out
+
+
+def mapped(
+    dataset,
+    obs_keys: list[str] | None = None,
+    join: Literal["inner", "outer"] | None = "inner",
+    encode_labels: bool | list[str] = True,
+    unknown_label: str | dict[str, str] | None = None,
+    cache_categories: bool = True,
+    parallel: bool = False,
+    dtype: str | None = None,
+    stream: bool = False,
+    is_run_input: bool | None = None,
+) -> MappedCollection:
+    path_list = []
+    for artifact in dataset.artifacts.all():
+        if artifact.suffix not in {".h5ad", ".zrad", ".zarr"}:
+            print(f"Ignoring artifact with suffix {artifact.suffix}")
+            continue
+        elif not artifact.path.exists():
+            print(f"Path does not exist for artifact with suffix {artifact.suffix}")
+            continue
+        elif not stream:
+            path_list.append(artifact.stage())
+        else:
+            path_list.append(artifact.path)
+    ds = MappedCollection(
+        path_list=path_list,
+        obs_keys=obs_keys,
+        join=join,
+        encode_labels=encode_labels,
+        unknown_label=unknown_label,
+        cache_categories=cache_categories,
+        parallel=parallel,
+        dtype=dtype,
+    )
+    return ds
scdataloader/datamodule.py
CHANGED
@@ -130,7 +130,7 @@ class DataModule(L.LightningDataModule):
             print(f"reduced the size to {len(set(c))/len(biomart)}")
             biomart["pos"] = c
             mdataset.genedf = biomart.loc[mdataset.genedf.index]
-            self.gene_pos = mdataset.genedf["pos"].tolist()
+            self.gene_pos = mdataset.genedf["pos"].astype(int).tolist()

             if gene_embeddings != "":
                 mdataset.genedf = mdataset.genedf.join(
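The added `.astype(int)` guards against a general pandas pitfall: integer columns are silently upcast to float when an operation introduces missing values, and float positions would later fail wherever `gene_pos` is used as an index. Whether that exact upcast happens here is an assumption; the diff only shows the cast being added. A demonstration of the underlying pandas behavior:

```python
import pandas as pd

s = pd.Series([10, 20, 30])       # dtype: int64
upcast = s.reindex([0, 1, 2, 3])  # the unknown label becomes NaN
print(upcast.dtype)               # float64: the ints were silently upcast
print(upcast.dropna().astype(int).tolist())  # [10, 20, 30], ints again
```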
scdataloader/mapped.py
CHANGED
@@ -7,12 +7,12 @@ from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Union

 import numpy as np
 import pandas as pd
-from lamin_utils import logger
 from lamindb_setup.core.upath import UPath
-from lamindb.core._data import _track_run_input

-from lamindb.core.storage._backed_access import (
+from lamindb.core.storage._anndata_accessor import (
+    ArrayType,
     ArrayTypes,
+    GroupType,
     GroupTypes,
     StorageType,
     _safer_read_index,
@@ -47,42 +47,6 @@ class _Connect:
         self.conn.close()


-def mapped(
-    dataset,
-    label_keys: str | list[str] | None = None,
-    join: Literal["inner", "outer"] | None = "inner",
-    encode_labels: bool | list[str] = True,
-    unknown_label: str | dict[str, str] | None = None,
-    cache_categories: bool = True,
-    parallel: bool = False,
-    dtype: str | None = None,
-    stream: bool = False,
-    is_run_input: bool | None = None,
-) -> MappedCollection:
-    path_list = []
-    for artifact in dataset.artifacts.all():
-        if artifact.suffix not in {".h5ad", ".zrad", ".zarr"}:
-            logger.warning(f"Ignoring artifact with suffix {artifact.suffix}")
-            continue
-        elif not stream:
-            path_list.append(artifact.stage())
-        else:
-            path_list.append(artifact.path)
-    ds = MappedCollection(
-        path_list,
-        label_keys,
-        join,
-        encode_labels,
-        unknown_label,
-        cache_categories,
-        parallel,
-        dtype,
-    )
-    # track only if successful
-    _track_run_input(dataset, is_run_input)
-    return ds
-
-
 class MappedCollection:
     """Map-style collection for use in data loaders.

@@ -92,6 +56,12 @@ class MappedCollection:
     If your `AnnData` collection is in the cloud, move them into a local cache
     first for faster access.

+    `__getitem__` of the `MappedCollection` object takes a single integer index
+    and returns a dictionary with the observation data sample for this index from
+    the `AnnData` objects in `path_list`. The dictionary has keys for `layers_keys`
+    (`.X` is in `"X"`), `obs_keys`, `obsm_keys` (under `f"obsm_{key}"`) and also `"_store_idx"`
+    for the index of the `AnnData` object containing this observation sample.
+
     .. note::

        For a guide, see :doc:`docs:scrna5`.
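Following the docstring above, fetching one sample might look like this minimal sketch (the `.h5ad` paths and the key names "cell_type" and "X_pca" are hypothetical placeholders):

```python
# Sketch of indexing a MappedCollection per the docstring above.
from scdataloader.mapped import MappedCollection

mc = MappedCollection(
    path_list=["a.h5ad", "b.h5ad"],  # hypothetical local files
    obs_keys=["cell_type"],
    obsm_keys=["X_pca"],
)
item = mc[0]
item["X"]           # expression row from .X (layers_keys defaults to ["X"])
item["obsm_X_pca"]  # .obsm entries come back under f"obsm_{key}"
item["cell_type"]   # requested obs column, integer-encoded by default
item["_store_idx"]  # which AnnData file this observation came from
```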
@@ -107,53 +77,71 @@ class MappedCollection:

     Args:
         path_list: A list of paths to `AnnData` objects stored in `.h5ad` or `.zarr` formats.
-        label_keys: …
+        layers_keys: Keys from the ``.layers`` slot. ``layers_keys=None`` or ``"X"`` in the list
+            retrieves ``.X``.
+        obsm_keys: Keys from the ``.obsm`` slots.
+        obs_keys: Keys from the ``.obs`` slots.
         join: `"inner"` or `"outer"` virtual joins. If ``None`` is passed,
             does not join.
         encode_labels: Encode labels into integers.
-            Can be a list with elements from ``label_keys``.
+            Can be a list with elements from ``obs_keys``.
         unknown_label: Encode this label to -1.
-            Can be a dictionary with keys from ``label_keys`` if ``encode_labels=True``
+            Can be a dictionary with keys from ``obs_keys`` if ``encode_labels=True``
             or from ``encode_labels`` if it is a list.
-        cache_categories: Enable caching categories of ``label_keys`` for faster access.
+        cache_categories: Enable caching categories of ``obs_keys`` for faster access.
         parallel: Enable sampling with multiple processes.
-        dtype: Convert numpy arrays from ``.X``
+        dtype: Convert numpy arrays from ``.X``, ``.layers`` and ``.obsm``
     """

     def __init__(
         self,
         path_list: list[UPathStr],
-        label_keys: str | list[str] | None = None,
-        join: Literal["inner", "outer"] | None = "inner",
+        layers_keys: str | list[str] | None = None,
+        obs_keys: str | list[str] | None = None,
+        obsm_keys: str | list[str] | None = None,
+        join: Literal["inner", "outer"] | None = "inner",
         encode_labels: bool | list[str] = True,
         unknown_label: str | dict[str, str] | None = None,
         cache_categories: bool = True,
         parallel: bool = False,
         dtype: str | None = None,
     ):
-        …
+        if join not in {None, "inner", "outer"}:  # pragma: nocover
+            raise ValueError(
+                f"join must be one of None, 'inner, or 'outer' but was {type(join)}"
+            )

-        label_keys = [label_keys] if isinstance(label_keys, str) else label_keys
-        self.label_keys = label_keys
+        if layers_keys is None:
+            self.layers_keys = ["X"]
+        else:
+            self.layers_keys = (
+                [layers_keys] if isinstance(layers_keys, str) else layers_keys
+            )
+
+        obsm_keys = [obsm_keys] if isinstance(obsm_keys, str) else obsm_keys
+        self.obsm_keys = obsm_keys
+
+        obs_keys = [obs_keys] if isinstance(obs_keys, str) else obs_keys
+        self.obs_keys = obs_keys

         if isinstance(encode_labels, list):
             if len(encode_labels) == 0:
                 encode_labels = False
-            elif not all(
-                enc_label in label_keys for enc_label in encode_labels
+            elif obs_keys is None or not all(
+                enc_label in obs_keys for enc_label in encode_labels
             ):
                 raise ValueError(
-                    "All elements of `encode_labels` should be in `label_keys`."
+                    "All elements of `encode_labels` should be in `obs_keys`."
                 )
         else:
             if encode_labels:
-                encode_labels = label_keys
+                encode_labels = obs_keys if obs_keys is not None else False
         self.encode_labels = encode_labels

         if encode_labels and isinstance(unknown_label, dict):
             if not all(unkey in encode_labels for unkey in unknown_label):  # type: ignore
                 raise ValueError(
-                    "All keys of `unknown_label` should be in `encode_labels` and `label_keys`."
+                    "All keys of `unknown_label` should be in `encode_labels` and `obs_keys`."
                 )
         self.unknown_label = unknown_label

@@ -194,12 +182,16 @@ class MappedCollection:

         self.join_vars = join
         self.var_indices = None
+        self.var_joint = None
+        self.n_vars_list = None
+        self.n_vars = None
         if self.join_vars is not None:
             self._make_join_vars()
+            self.n_vars = len(self.var_joint)

-        if self.label_keys is not None:
+        if self.obs_keys is not None:
             if cache_categories:
-                self._cache_categories(self.label_keys)
+                self._cache_categories(self.obs_keys)
             else:
                 self._cache_cats: dict = {}
             self.encoders: dict = {}
@@ -222,10 +214,10 @@ class MappedCollection:
             self.conns.append(conn)
             self.storages.append(storage)

-    def _cache_categories(self, label_keys: list):
+    def _cache_categories(self, obs_keys: list):
         self._cache_cats = {}
         decode = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)
-        for label in label_keys:
+        for label in obs_keys:
             self._cache_cats[label] = []
             for storage in self.storages:
                 with _Connect(storage) as store:
@@ -252,9 +244,12 @@ class MappedCollection:

     def _make_join_vars(self):
         var_list = []
+        self.n_vars_list = []
         for storage in self.storages:
             with _Connect(storage) as store:
-                var_list.append(_safer_read_index(store["var"]))
+                vars = _safer_read_index(store["var"])
+                var_list.append(vars)
+                self.n_vars_list.append(len(vars))

         self.var_joint = None
         vars_eq = all(var_list[0].equals(vrs) for vrs in var_list[1:])
@@ -262,6 +257,7 @@ class MappedCollection:
             self.join_vars = None
             self.var_joint = var_list[0]
             return
+
         if self.join_vars == "inner":
             self.var_joint = reduce(pd.Index.intersection, var_list)
             if len(self.var_joint) == 0:
@@ -285,6 +281,20 @@ class MappedCollection:
     def __len__(self):
         return self.n_obs

+    @property
+    def shape(self):
+        """Shape of the (virtually aligned) dataset."""
+        return (self.n_obs, self.n_vars)
+
+    @property
+    def original_shapes(self):
+        """Shapes of the underlying AnnData objects."""
+        if self.n_vars_list is None:
+            n_vars_list = [None] * len(self.n_obs_list)
+        else:
+            n_vars_list = self.n_vars_list
+        return list(zip(self.n_obs_list, n_vars_list))
+
     def __getitem__(self, idx: int):
         obs_idx = self.indices[idx]
         storage_idx = self.storage_idx[idx]
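The two properties added above are convenience views over bookkeeping the class already tracks; continuing the earlier sketch:

```python
# mc is the MappedCollection from the earlier sketch.
print(mc.shape)            # (total n_obs across files, n_vars after the join)
print(mc.original_shapes)  # [(n_obs_i, n_vars_i), ...], one tuple per file;
                           # n_vars_i is None when no join was computed
```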
@@ -294,17 +304,28 @@ class MappedCollection:
         var_idxs_join = None

         with _Connect(self.storages[storage_idx]) as store:
-            out = {
-                "x": …,
-            }
-            for label in self.label_keys:
+            out = {}
+            for layers_key in self.layers_keys:
+                lazy_data = (
+                    store["X"] if layers_key == "X" else store["layers"][layers_key]
+                )
+                out[layers_key] = self._get_data_idx(
+                    lazy_data, obs_idx, self.join_vars, var_idxs_join, self.n_vars
+                )
+            if self.obsm_keys is not None:
+                for obsm_key in self.obsm_keys:
+                    lazy_data = store["obsm"][obsm_key]
+                    out[f"obsm_{obsm_key}"] = self._get_data_idx(lazy_data, obs_idx)
+            out["_store_idx"] = storage_idx
+            if self.obs_keys is not None:
+                for label in self.obs_keys:
                     if label in self._cache_cats:
                         cats = self._cache_cats[label][storage_idx]
                         if cats is None:
                             cats = []
                     else:
                         cats = None
-                    label_idx = self._get_label_idx(store, obs_idx, label, cats)
+                    label_idx = self._get_obs_idx(store, obs_idx, label, cats)
                     if label in self.encoders:
                         label_idx = self.encoders[label][label_idx]
                     out[label] = label_idx
@@ -312,46 +333,46 @@ class MappedCollection:

     def _get_data_idx(
         self,
-        layer: ArrayType | GroupType,  # type: ignore
+        lazy_data: ArrayType | GroupType,  # type: ignore
         idx: int,
+        join_vars: Literal["inner", "outer"] | None = None,
         var_idxs_join: list | None = None,
-        …
+        n_vars_out: int | None = None,
     ):
         """Get the index for the data."""
-        if isinstance(layer, ArrayTypes):  # type: ignore
-            layer_idx = layer[idx]
-            if self.join_vars is None:
-                result = layer_idx
+        if isinstance(lazy_data, ArrayTypes):  # type: ignore
+            lazy_data_idx = lazy_data[idx]  # type: ignore
+            if join_vars is None:
+                result = lazy_data_idx
                 if self._dtype is not None:
                     result = result.astype(self._dtype, copy=False)
-            elif self.join_vars == "outer":
-                dtype = layer_idx.dtype if self._dtype is None else self._dtype
-                result = np.zeros(len(self.var_joint), dtype=dtype)
-                result[var_idxs_join] = layer_idx
+            elif join_vars == "outer":
+                dtype = lazy_data_idx.dtype if self._dtype is None else self._dtype
+                result = np.zeros(n_vars_out, dtype=dtype)
+                result[var_idxs_join] = lazy_data_idx
             else:  # inner join
-                result = layer_idx[var_idxs_join]
+                result = lazy_data_idx[var_idxs_join]
                 if self._dtype is not None:
                     result = result.astype(self._dtype, copy=False)
             return result
         else:  # assume csr_matrix here
-            data = layer["data"]  # type: ignore
-            indices = layer["indices"]  # type: ignore
-            indptr = layer["indptr"]  # type: ignore
+            data = lazy_data["data"]  # type: ignore
+            indices = lazy_data["indices"]  # type: ignore
+            indptr = lazy_data["indptr"]  # type: ignore
             s = slice(*(indptr[idx : idx + 2]))
             data_s = data[s]
             dtype = data_s.dtype if self._dtype is None else self._dtype
-            if self.join_vars == "outer":
-                layer_idx = np.zeros(len(self.var_joint), dtype=dtype)
-                layer_idx[var_idxs_join[indices[s]]] = data_s
+            if join_vars == "outer":
+                lazy_data_idx = np.zeros(n_vars_out, dtype=dtype)
+                lazy_data_idx[var_idxs_join[indices[s]]] = data_s
             else:
-                layer_idx = np.zeros(layer.attrs["shape"][1], dtype=dtype)
-                layer_idx[indices[s]] = data_s
-                if self.join_vars == "inner":
-                    layer_idx = layer_idx[var_idxs_join]
-            return layer_idx
+                lazy_data_idx = np.zeros(lazy_data.attrs["shape"][1], dtype=dtype)  # type: ignore
+                lazy_data_idx[indices[s]] = data_s
+                if join_vars == "inner":
+                    lazy_data_idx = lazy_data_idx[var_idxs_join]
+            return lazy_data_idx

-    def _get_label_idx(
+    def _get_obs_idx(
         self,
         storage: StorageType,
         idx: int,
@@ -379,12 +400,12 @@ class MappedCollection:
             label = label.decode("utf-8")
         return label

-    def get_label_weights(self, label_keys: str | list[str], scaler: int = 10):
+    def get_label_weights(self, obs_keys: str | list[str], scaler: int = 10):
         """Get all weights for the given label keys."""
-        if isinstance(label_keys, str):
-            label_keys = [label_keys]
+        if isinstance(obs_keys, str):
+            obs_keys = [obs_keys]
         labels_list = []
-        for label_key in label_keys:
+        for label_key in obs_keys:
             labels_to_str = self.get_merged_labels(label_key).astype(str).astype("O")
             labels_list.append(labels_to_str)
         if len(labels_list) > 1:
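If `get_label_weights` returns one weight per observation, as lamindb's method of the same name does (rarer labels receive larger weights, with `scaler` damping the extremes), it can drive a weighted sampler. A sketch under that assumption:

```python
# Assumption: get_label_weights returns a 1-D per-observation weight array,
# matching lamindb's MappedCollection.get_label_weights.
import torch
from torch.utils.data import DataLoader, WeightedRandomSampler

# mc is the MappedCollection from the earlier sketches.
weights = mc.get_label_weights("cell_type_ontology_term_id", scaler=10)
sampler = WeightedRandomSampler(
    weights=torch.as_tensor(weights, dtype=torch.double),
    num_samples=len(mc),
    replacement=True,  # oversample rare labels, undersample common ones
)
loader = DataLoader(mc, sampler=sampler, batch_size=64)
```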
scdataloader/preprocess.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Optional, Union
+from typing import Callable, Optional, Union
 from uuid import uuid4

 import anndata as ad
@@ -7,9 +7,7 @@ import numpy as np
 import pandas as pd
 import scanpy as sc
 from anndata import AnnData
-from django.db import IntegrityError
 from scipy.sparse import csr_matrix
-import os

 from scdataloader import utils as data_utils

@@ -268,9 +266,6 @@ class Preprocessor:
         # QC

         adata.var[genesdf.columns] = genesdf.loc[adata.var.index]
-        for name in ["stable_id", "created_at", "updated_at"]:
-            if name in adata.var.columns:
-                adata.var = adata.var.drop(columns=name)
         print("startin QC")
         sc.pp.calculate_qc_metrics(
             adata, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20]
scdataloader/utils.py
CHANGED
@@ -138,7 +138,6 @@ def getBiomartTable(

     res = _fetchFromServer(ensemble_server, attr + attributes, database=database)
     res.to_csv(cachefile, index=False)
-
     res.columns = attr + attributes
     if type(res) is not type(pd.DataFrame()):
         raise ValueError("should be a dataframe")
@@ -355,13 +354,12 @@ def load_dataset_local(

 def load_genes(organisms: Union[str, list] = "NCBITaxon:9606"):  # "NCBITaxon:10090",
     organismdf = []
-    if type(organisms) == str:
+    if type(organisms) is str:
         organisms = [organisms]
     for organism in organisms:
         genesdf = bt.Gene.filter(
             organism_id=bt.Organism.filter(ontology_id=organism).first().id
         ).df()
-        genesdf = genesdf[~genesdf["public_source_id"].isna()]
         genesdf = genesdf.drop_duplicates(subset="ensembl_gene_id")
         genesdf = genesdf.set_index("ensembl_gene_id").sort_index()
         # mitochondrial genes
@@ -372,7 +370,12 @@ def load_genes(organisms: Union[str, list] = "NCBITaxon:9606"):  # "NCBITaxon:10
         genesdf["hb"] = genesdf.symbol.astype(str).str.contains(("^HB[^(P)]"))
         genesdf["organism"] = organism
         organismdf.append(genesdf)
-    return pd.concat(organismdf)
+    organismdf = pd.concat(organismdf)
+    organismdf.drop(
+        columns=["source_id", "run_id", "created_by_id", "updated_at", "stable_id"],
+        inplace=True,
+    )
+    return organismdf


 def populate_my_ontology(
@@ -409,75 +412,82 @@ def populate_my_ontology(
         diseases (list, optional): List of diseases. Defaults to [].
         dev_stages (list, optional): List of developmental stages. Defaults to [].
     """
-    …
+    # cell type
+    if celltypes is not None:
+        names = bt.CellType.public().df().index if not celltypes else celltypes
+        records = bt.CellType.from_values(names, field="ontology_id")
+        ln.save(records)
+        bt.CellType(name="unknown", ontology_id="unknown").save()
     # Organism
-    …
+    if organisms is not None:
+        names = bt.Organism.public().df().index if not organisms else organisms
+        records = [
+            i[0] if type(i) is list else i
+            for i in [bt.Organism.from_source(ontology_id=i) for i in names]
+        ]
+        ln.save(records)
+        bt.Organism(name="unknown", ontology_id="unknown").save()
+        organism_names = names
     # Phenotype
-    …
-    ln.save(records, parents=bool(sex))
-    bt.Phenotype(name="unknown", ontology_id="unknown").save()
+    if sex is not None:
+        names = bt.Phenotype.public().df().index if not sex else sex
+        records = [
+            bt.Phenotype.from_source(
+                ontology_id=i, source=bt.PublicSource.filter(name="pato").first()
+            )
+            for i in names
+        ]
+        ln.save(records)
+        bt.Phenotype(name="unknown", ontology_id="unknown").save()
     # ethnicity
-    …
+    if ethnicities is not None:
+        names = bt.Ethnicity.public().df().index if not ethnicities else ethnicities
+        records = bt.Ethnicity.from_values(names, field="ontology_id")
+        ln.save(records)
+        bt.Ethnicity(
+            name="unknown", ontology_id="unknown"
+        ).save()  # multi ethnic will have to get renamed
     # ExperimentalFactor
-    …
+    if assays is not None:
+        names = bt.ExperimentalFactor.public().df().index if not assays else assays
+        records = bt.ExperimentalFactor.from_values(names, field="ontology_id")
+        ln.save(records)
+        bt.ExperimentalFactor(name="unknown", ontology_id="unknown").save()
+        # lookup = bt.ExperimentalFactor.lookup()
+        # lookup.smart_seq_v4.parents.add(lookup.smart_like)
     # Tissue
-    …
+    if tissues is not None:
+        names = bt.Tissue.public().df().index if not tissues else tissues
+        records = bt.Tissue.from_values(names, field="ontology_id")
+        ln.save(records)
+        bt.Tissue(name="unknown", ontology_id="unknown").save()
     # DevelopmentalStage
-    …
+    if dev_stages is not None:
+        names = (
+            bt.DevelopmentalStage.public().df().index if not dev_stages else dev_stages
+        )
+        records = bt.DevelopmentalStage.from_values(names, field="ontology_id")
+        ln.save(records)
+        bt.DevelopmentalStage(name="unknown", ontology_id="unknown").save()
+
+        names = bt.DevelopmentalStage.public(organism="mouse").df().index
+        records = [
+            bt.DevelopmentalStage.from_source(
+                ontology_id=i,
+                source=bt.PublicSource.filter(organism="mouse", name="mmusdv").first(),
+            )
+            for i in names.tolist()
+        ]
+        ln.save(records)
     # Disease
-    …
+    if diseases is not None:
+        names = bt.Disease.public().df().index if not diseases else diseases
+        records = bt.Disease.from_values(names, field="ontology_id")
+        ln.save(records)
+        bt.Disease(name="normal", ontology_id="PATO:0000461").save()
+        bt.Disease(name="unknown", ontology_id="unknown").save()
     # genes
-    for organism in organisms:
+    for organism in organism_names:
         # convert onto to name
         organism = bt.Organism.filter(ontology_id=organism).one().name
         names = bt.Gene.public(organism=organism).df()["ensembl_gene_id"]
@@ -523,26 +533,6 @@ def length_normalize(adata: AnnData, gene_lengths: list):
     return adata


-def pd_load_cached(url: str, loc: str = "/tmp/", cache: bool = True, **kwargs):
-    """
-    pd_load_cached downloads a file from a url and loads it as a pandas dataframe
-
-    Args:
-        url (str): the url to download the file from
-        loc (str, optional): the location to save the file to. Defaults to "/tmp/".
-        cache (bool, optional): whether to use the cached file or not. Defaults to True.
-
-    Returns:
-        pd.DataFrame: the dataframe
-    """
-    # Check if the file exists, if not, download it
-    loc += url.split("/")[-1]
-    if not os.path.isfile(loc) or not cache:
-        urllib.request.urlretrieve(url, loc)
-    # Load the data from the file
-    return pd.read_csv(loc, **kwargs)
-
-
 def translate(
     val: Union[str, list, set, Counter, dict], t: str = "cell_type_ontology_term_id"
 ):
{scdataloader-1.0.1.dist-info → scdataloader-1.0.6.dist-info}/METADATA
CHANGED
@@ -1,28 +1,37 @@
 Metadata-Version: 2.1
 Name: scdataloader
-Version: 1.0.1
+Version: 1.0.6
 Summary: a dataloader for single cell data in lamindb
 Home-page: https://github.com/jkobject/scDataLoader
 License: GPL3
-Keywords: scRNAseq,dataloader,pytorch,lamindb,…
+Keywords: scRNAseq,dataloader,pytorch,lamindb,scPRINT
 Author: jkobject
 Requires-Python: ==3.10.*
 Classifier: License :: Other/Proprietary License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
+Provides-Extra: dev
 Requires-Dist: anndata
 Requires-Dist: biomart
-Requires-Dist: bionty
+Requires-Dist: bionty (==0.48.0)
+Requires-Dist: black (>=23.10.1,<24.0.0) ; extra == "dev"
 Requires-Dist: cellxgene-census
+Requires-Dist: coverage (>=7.3.2,<8.0.0) ; extra == "dev"
 Requires-Dist: decoupler
 Requires-Dist: django
+Requires-Dist: flake8 (>=6.1.0,<7.0.0) ; extra == "dev"
+Requires-Dist: gitchangelog (>=3.0.4,<4.0.0) ; extra == "dev"
 Requires-Dist: ipykernel
-Requires-Dist: lamindb
+Requires-Dist: isort (>=5.12.0,<6.0.0) ; extra == "dev"
+Requires-Dist: lamindb (==0.75.1)
 Requires-Dist: leidenalg
 Requires-Dist: lightning
-Requires-Dist: lnschema-bionty
 Requires-Dist: matplotlib
+Requires-Dist: mkdocs (>=1.5.3,<2.0.0) ; extra == "dev"
+Requires-Dist: mypy (>=1.6.1,<2.0.0) ; extra == "dev"
 Requires-Dist: pandas (>=2.0.0)
+Requires-Dist: pytest (>=7.4.3,<8.0.0) ; extra == "dev"
+Requires-Dist: pytest-cov (>=4.1.0,<5.0.0) ; extra == "dev"
 Requires-Dist: scikit-misc
 Requires-Dist: seaborn
 Requires-Dist: torch
@@ -61,6 +70,8 @@ It allows you to:

 built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd

+The package has been designed together with the [scPRINT paper](https://doi.org/10.1101/2024.07.29.605556) and [model](https://github.com/cantinilab/scPRINT).
+
 ## More

 I needed to create this Data Loader for my PhD project. I am using it to load & preprocess thousands of datasets containing millions of cells in a few seconds. I believed that individuals employing AI for single-cell RNA sequencing and other sequencing datasets would eagerly utilize and desire such a tool, which presently does not exist.
@@ -71,16 +82,42 @@ I needed to create this Data Loader for my PhD project. I am using it to load &

 ```bash
 pip install scdataloader
+# or
+pip install scDataLoader[dev] # for dev dependencies
+
+lamin login <email> --key <API-key>
+lamin init --storage [folder-name-where-lamin-data-will-be-stored] --schema bionty
 ```

-…
+if you start with lamin and had to do a `lamin init`, you will also need to populate your ontologies. This is because scPRINT is using ontologies to define its cell types, diseases, sexes, ethnicities, etc.
+
+you can do it manually or with our function:
+
+```python
+from scdataloader.utils import populate_my_ontology
+
+populate_my_ontology() #to populate everything (recommended) (can take 2-10mns)
+
+populate_my_ontology( #the minimum for scprint to run some inferences (denoising, grn inference)
+    organisms: List[str] = ["NCBITaxon:10090", "NCBITaxon:9606"],
+    sex: List[str] = ["PATO:0000384", "PATO:0000383"],
+    celltypes = None,
+    ethnicities = None,
+    assays = None,
+    tissues = None,
+    diseases = None,
+    dev_stages = None,
+)
+```
+
+### Dev install
+
+If you want to use the latest version of scDataLoader and work on the code yourself use `git clone` and `pip -e` instead of `pip install`.

 ```bash
 git clone https://github.com/jkobject/scDataLoader.git
-…
-poetry install
+pip install -e scDataLoader[dev]
 ```
-then run the notebooks with the poetry installed environment

 ## Usage

@@ -147,6 +184,27 @@ The main way to use

 > please refer to the [scPRINT documentation](https://www.jkobject.com/scPRINT/) and [lightning documentation](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_intermediate.html) for more information on command line usage

+## FAQ
+
+### how to update my ontologies?
+
+```bash
+import bionty as bt
+bt.reset_sources()
+
+# Run via CLI: lamin load <your instance>
+
+import lnschema_bionty as lb
+lb.dev.sync_bionty_source_to_latest()
+```
+
+### how to load all ontologies?
+
+```python
+from scdataloader import utils
+utils.populate_ontologies() # this might take from 5-20mins
+```
+
 ## Development

 Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.
@@ -163,6 +221,7 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file
 - [scprint](https://www.jkobject.com/scPRINT/)

 Awesome single cell dataloader created by @jkobject
+
 GNU GENERAL PUBLIC LICENSE
   Version 3, 29 June 2007

scdataloader-1.0.6.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@
+scdataloader/VERSION,sha256=jFS_q38a6b0acUjq5B57Co9K03JuDKxw-COi1F255gw,6
+scdataloader/__init__.py,sha256=lbO3lGiXXgirB07KXj1Fu0BzL7T43VmitqJBTyfSz7M,147
+scdataloader/__main__.py,sha256=db_upDq3tNEtcDH17mPIczToAqGkSKfLy0Qbj6B4YmE,6385
+scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
+scdataloader/collator.py,sha256=O5VK2asIfFIQc-Ozm55Bc-OORIlPj_yOt7qn6xqXd74,11292
+scdataloader/config.py,sha256=rrW2DZxG4J2_pmpDbXXsaKJkpNC57w5dIlItiFbANYw,2905
+scdataloader/data.py,sha256=3a9jUhREIzbxC797COGNSn6QqbRiiC30FzxXCoYsTNo,13773
+scdataloader/datamodule.py,sha256=JZq8g274ce3ARW59qwg5GKAt2SzOTaMPGh3CySGQS70,16893
+scdataloader/mapped.py,sha256=s_Fg-lwaXjHFyQcKnp9El2IceMoaEajynyUgOnpVnXQ,20750
+scdataloader/preprocess.py,sha256=9dgsq7c5jD2l-CUGfwC2uG98MCIgnrYFkqknqAyu5dU,28841
+scdataloader/utils.py,sha256=8YIVpqJzNKkIIpAS5p01gyt57X2CrfaMEsC1EJs-q_A,21451
+scdataloader-1.0.6.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+scdataloader-1.0.6.dist-info/METADATA,sha256=LhqbssiiI5Y-GZeYk1nXnHFNo2SNhuc6W9LwlIX_OCo,43336
+scdataloader-1.0.6.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+scdataloader-1.0.6.dist-info/entry_points.txt,sha256=nLqucZaa5wiF7-1FCgMXO916WDQ9Qm0TcxQp0f1DwE4,59
+scdataloader-1.0.6.dist-info/RECORD,,
scdataloader-1.0.1.dist-info/RECORD
DELETED
@@ -1,16 +0,0 @@
-scdataloader/VERSION,sha256=WYVJhIUxBN9cNT4vaBoV_HkkdC-aLkaMKa8kjc5FzgM,6
-scdataloader/__init__.py,sha256=NIlE4oTUPRZ3uSW_maozoEHp470I7PV1vMOJ4XpSmL4,122
-scdataloader/__main__.py,sha256=db_upDq3tNEtcDH17mPIczToAqGkSKfLy0Qbj6B4YmE,6385
-scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
-scdataloader/collator.py,sha256=zkFdxirTDub1dJ1OJXO0p48kvd2r2ncKMdevAKIdTTc,13447
-scdataloader/config.py,sha256=rrW2DZxG4J2_pmpDbXXsaKJkpNC57w5dIlItiFbANYw,2905
-scdataloader/data.py,sha256=VugtHo9T9PqoJSv3lkJJAB89KD-fRwdVw1D76gnCc9c,12584
-scdataloader/datamodule.py,sha256=WLEWcDMcC1G3VD5tORfhfqRRHcTscpI0EzPikg3udbI,16881
-scdataloader/mapped.py,sha256=yF9l3obuRWbQjW8QZGRSKhc50fizXTWf3Pe1m542fW8,19481
-scdataloader/preprocess.py,sha256=noynYWuy9clhFu9UnN-vSvAHJHwakDttkI5aj1e_T98,29055
-scdataloader/utils.py,sha256=xyDsWaqkjhzlVBP8FiYdBUWHsel3twcVWmI53PhKqTM,21888
-scdataloader-1.0.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-scdataloader-1.0.1.dist-info/METADATA,sha256=2Xd8M1dq_JmvmFjmrrzn-1U4eOtwU6L51Y_7MCkGxvY,41327
-scdataloader-1.0.1.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
-scdataloader-1.0.1.dist-info/entry_points.txt,sha256=nLqucZaa5wiF7-1FCgMXO916WDQ9Qm0TcxQp0f1DwE4,59
-scdataloader-1.0.1.dist-info/RECORD,,
{scdataloader-1.0.1.dist-info → scdataloader-1.0.6.dist-info}/LICENSE
File without changes
{scdataloader-1.0.1.dist-info → scdataloader-1.0.6.dist-info}/WHEEL
File without changes
{scdataloader-1.0.1.dist-info → scdataloader-1.0.6.dist-info}/entry_points.txt
File without changes