scdataloader 2.0.0__py3-none-any.whl → 2.0.3__py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
scdataloader/__main__.py CHANGED
@@ -1,5 +1,5 @@
  import argparse
- from typing import Optional, Union
+ from typing import List, Optional, Union

  import lamindb as ln

@@ -149,7 +149,7 @@ def main():
  )
  preprocess_parser.add_argument(
  "--batch_keys",
- type=list[str],
+ type=List[str],
  default=[
  "assay_ontology_term_id",
  "self_reported_ethnicity_ontology_term_id",
@@ -229,11 +229,11 @@ def main():
  if args.instance is not None:
  collection = (
  ln.Collection.using(instance=args.instance)
- .filter(name=args.name, version=args.version)
+ .filter(key=args.name, version=args.version)
  .first()
  )
  else:
- collection = ln.Collection.filter(name=args.name, version=args.version).first()
+ collection = ln.Collection.filter(key=args.name, version=args.version).first()

  print(
  "using the dataset ", collection, " of size ", len(collection.artifacts.all())
@@ -262,7 +262,6 @@ def main():
  additional_preprocess=additional_preprocess,
  additional_postprocess=additional_postprocess,
  keep_files=False,
- force_preloaded=args.force_preloaded,
  )

  # Preprocess the dataset
scdataloader/collator.py CHANGED
@@ -1,18 +1,20 @@
- from typing import Optional
+ from typing import List, Optional

  import numpy as np
+ import pandas as pd
  from torch import Tensor, long

+ from .preprocess import _digitize
  from .utils import load_genes


  class Collator:
  def __init__(
  self,
- organisms: list[str],
+ organisms: List[str],
  how: str = "all",
  org_to_id: dict[str, int] = None,
- valid_genes: list[str] = [],
+ valid_genes: Optional[List[str]] = None,
  max_len: int = 2000,
  add_zero_genes: int = 0,
  logp1: bool = False,
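Replacing the `valid_genes: list[str] = []` default with `Optional[List[str]] = None` removes a mutable-default hazard: Python evaluates default values once, so a default list is shared across every call and instance that omits the argument. A self-contained illustration:

```python
# Why a None default is safer than a mutable [] default: the empty list is
# created once at definition time and shared by every call that omits it.
from typing import List, Optional

def bad(genes: List[str] = []) -> List[str]:
    genes.append("ENSG00000141510")  # mutates the one shared default list
    return genes

def good(genes: Optional[List[str]] = None) -> List[str]:
    genes = [] if genes is None else genes  # fresh list on each call
    genes.append("ENSG00000141510")
    return genes

print(bad())   # ['ENSG00000141510']
print(bad())   # ['ENSG00000141510', 'ENSG00000141510'] — state leaked
print(good())  # ['ENSG00000141510']
print(good())  # ['ENSG00000141510'] — calls stay independent
```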
@@ -20,8 +22,9 @@ class Collator:
  n_bins: int = 0,
  tp_name: Optional[str] = None,
  organism_name: str = "organism_ontology_term_id",
- class_names: list[str] = [],
- genelist: list[str] = [],
+ class_names: List[str] = [],
+ genelist: List[str] = [],
+ genedf: Optional[pd.DataFrame] = None,
  ):
  """
  This class is responsible for collating data for the scPRINT model. It handles the
@@ -71,21 +74,22 @@ class Collator:
  self.start_idx = {}
  self.accepted_genes = {}
  self.to_subset = {}
- self._setup(None, org_to_id, valid_genes, genelist)
+ self._setup(genedf, org_to_id, valid_genes, genelist)

  def _setup(self, genedf=None, org_to_id=None, valid_genes=[], genelist=[]):
  if genedf is None:
  genedf = load_genes(self.organisms)
+ self.organism_ids = (
+ set([org_to_id[k] for k in self.organisms])
+ if org_to_id is not None
+ else set(self.organisms)
+ )
  self.org_to_id = org_to_id
  self.to_subset = {}
  self.accepted_genes = {}
  self.start_idx = {}
- self.organism_ids = (
- set([org_to_id[k] for k in self.organisms])
- if org_to_id is not None
- else set(self.organisms)
- )
- if len(valid_genes) > 0:
+
+ if valid_genes is not None:
  if len(set(valid_genes) - set(genedf.index)) > 0:
  print("Some valid genes are not in the genedf!!!")
  tot = genedf[genedf.index.isin(valid_genes)]
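Note the semantic shift in the gate: under 2.0.0 an empty `valid_genes` list meant "no filtering", whereas under 2.0.3 the branch runs whenever the argument is not None, so an explicitly passed empty list now filters the gene set down to nothing. A toy comparison:

```python
# Toy comparison of the two gates for an explicitly passed empty list.
valid_genes = []

if len(valid_genes) > 0:     # 2.0.0 gate: empty list skips filtering
    print("2.0.0: filtering")
if valid_genes is not None:  # 2.0.3 gate: empty list still enters the branch
    print("2.0.3: filtering (down to zero genes)")
```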
@@ -96,7 +100,7 @@ class Collator:
  self.start_idx.update({org: np.where(tot.organism == organism)[0][0]})

  ogenedf = genedf[genedf.organism == organism]
- if len(valid_genes) > 0:
+ if valid_genes is not None:
  self.accepted_genes.update({org: ogenedf.index.isin(valid_genes)})
  if len(genelist) > 0:
  df = ogenedf[ogenedf.index.isin(valid_genes)]
@@ -107,7 +111,7 @@ class Collator:
  __call__ applies the collator to a minibatch of data

  Args:
- batch (list[dict[str: array]]): List of dicts of arrays containing gene expression data.
+ batch (List[dict[str: array]]): List of dicts of arrays containing gene expression data.
  the first list is for the different samples, the second list is for the different elements with
  elem["X"]: gene expression
  elem["organism_name"]: organism ontology term id
@@ -115,7 +119,7 @@ class Collator:
  elem["class_names.."]: other classes

  Returns:
- list[Tensor]: List of tensors containing the collated data.
+ List[Tensor]: List of tensors containing the collated data.
  """
  # do count selection
  # get the unseen info and don't add any unseen
@@ -129,6 +133,7 @@ class Collator:
  nnz_loc = []
  is_meta = []
  knn_cells = []
+ knn_cells_info = []
  for elem in batch:
  organism_id = elem[self.organism_name]
  if organism_id not in self.organism_ids:
@@ -184,7 +189,14 @@ class Collator:
  if "knn_cells" in elem:
  # we complete with genes expressed in the knn
  # which is not a zero_loc in this context
- zero_loc = np.argsort(elem["knn_cells"].sum(0))[-ma:][::-1]
+ knn_expr = elem["knn_cells"].sum(0)
+ mask = np.ones(len(knn_expr), dtype=bool)
+ mask[loc] = False
+ available_indices = np.where(mask)[0]
+ available_knn_expr = knn_expr[available_indices]
+ sorted_indices = np.argsort(available_knn_expr)[::-1]
+ selected = min(ma, len(available_indices))
+ zero_loc = available_indices[sorted_indices[:selected]]
  else:
  zero_loc = np.where(expr == 0)[0]
  zero_loc = zero_loc[
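The rewritten branch fixes an overlap in the old one-liner, which ranked all genes by summed knn expression and could re-select genes already present in `loc`; the new code masks those indices out before ranking and caps the take at the number of remaining genes. A standalone numpy sketch of the same masking-and-ranking idea, with toy inputs:

```python
# Rank genes by summed expression across knn cells, excluding indices that
# were already selected in `loc`, then take at most `ma` of the rest.
import numpy as np

knn_cells = np.array([[0, 5, 1, 3], [0, 2, 4, 3]])  # (n_neighbors, n_genes)
loc = np.array([1])                                 # genes already selected
ma = 2                                              # extra genes to take

knn_expr = knn_cells.sum(0)                # [0, 7, 5, 6]
mask = np.ones(len(knn_expr), dtype=bool)
mask[loc] = False                          # exclude already-selected genes
available = np.where(mask)[0]              # [0, 2, 3]
order = np.argsort(knn_expr[available])[::-1]
zero_loc = available[order[: min(ma, len(available))]]
print(zero_loc)                            # [3 2] — top remaining genes
```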
@@ -208,6 +220,8 @@ class Collator:
  exprs.append(expr)
  if "knn_cells" in elem:
  knn_cells.append(elem["knn_cells"])
+ if "knn_cells_info" in elem:
+ knn_cells_info.append(elem["knn_cells_info"])
  # then we need to add the start_idx to the loc to give it the correct index
  # according to the model
  gene_locs.append(loc + self.start_idx[organism_id])
@@ -227,15 +241,46 @@ class Collator:
  dataset = np.array(dataset)
  is_meta = np.array(is_meta)
  knn_cells = np.array(knn_cells)
+ knn_cells_info = np.array(knn_cells_info)
+
  # normalize counts
  if self.norm_to is not None:
  expr = (expr * self.norm_to) / total_count[:, None]
+ # TODO: solve issue here
+ knn_cells = (knn_cells * self.norm_to) / total_count[:, None]
  if self.logp1:
  expr = np.log2(1 + expr)
+ knn_cells = np.log2(1 + knn_cells)

  # do binning of counts
- if self.n_bins:
- pass
+ if self.n_bins > 0:
+ binned_rows = []
+ bin_edges = []
+ for row in expr:
+ if row.max() == 0:
+ print(
+ "The input data contains all zero rows. Please make sure "
+ "this is expected. You can use the `filter_cell_by_counts` "
+ "arg to filter out all zero rows."
+ )
+ binned_rows.append(np.zeros_like(row, dtype=np.int64))
+ bin_edges.append(np.array([0] * self.n_bins))
+ continue
+ non_zero_ids = row.nonzero()
+ non_zero_row = row[non_zero_ids]
+ bins = np.quantile(non_zero_row, np.linspace(0, 1, self.n_bins - 1))
+ # bins = np.sort(np.unique(bins))
+ # NOTE: comment this line for now, since this will make the each category
+ # has different relative meaning across datasets
+ non_zero_digits = _digitize(non_zero_row, bins)
+ assert non_zero_digits.min() >= 1
+ assert non_zero_digits.max() <= self.n_bins - 1
+ binned_row = np.zeros_like(row, dtype=np.int64)
+ binned_row[non_zero_ids] = non_zero_digits
+ binned_rows.append(binned_row)
+ bin_edges.append(np.concatenate([[0], bins]))
+ expr = np.stack(binned_rows)
+ # expr = np.digitize(expr, bins=self.bins)

  ret = {
  "x": Tensor(expr),
@@ -248,44 +293,8 @@ class Collator:
  ret.update({"is_meta": Tensor(is_meta).int()})
  if len(knn_cells) > 0:
  ret.update({"knn_cells": Tensor(knn_cells)})
+ if len(knn_cells_info) > 0:
+ ret.update({"knn_cells_info": Tensor(knn_cells_info)})
  if len(dataset) > 0:
  ret.update({"dataset": Tensor(dataset).to(long)})
  return ret
-
-
- #############
- #### WIP ####
- #############
- class GeneformerCollator(Collator):
- def __init__(self, *args, gene_norm_list: list, **kwargs):
- """
- GeneformerCollator to finish
-
- Args:
- gene_norm_list (list): the normalization of expression through all datasets, per gene.
- """
- super().__init__(*args, **kwargs)
- self.gene_norm_list = gene_norm_list
-
- def __call__(self, batch):
- super().__call__(batch)
- # normlization per gene
-
- # tokenize the empty locations
-
-
- class scGPTCollator(Collator):
- """
- scGPTCollator to finish
- """
-
- def __call__(self, batch):
- super().__call__(batch)
- # binning
-
- # tokenize the empty locations
-
-
- class scPRINTCollator(Collator):
- def __call__(self, batch):
- super().__call__(batch)
scdataloader/data.py CHANGED
@@ -2,7 +2,7 @@ import warnings
  from collections import Counter
  from dataclasses import dataclass, field
  from functools import reduce
- from typing import Literal, Optional, Union
+ from typing import List, Literal, Optional, Union

  # ln.connect("scprint")
  import bionty as bt
@@ -38,8 +38,8 @@ class Dataset(torchDataset):
  ----
  lamin_dataset (lamindb.Dataset): lamin dataset to load
  genedf (pd.Dataframe): dataframe containing the genes to load
- obs (list[str]): list of observations to load from the Collection
- clss_to_predict (list[str]): list of observations to encode
+ obs (List[str]): list of observations to load from the Collection
+ clss_to_predict (List[str]): list of observations to encode
  join_vars (flag): join variables @see :meth:`~lamindb.Dataset.mapped`.
  hierarchical_clss: list of observations to map to a hierarchy using lamin's bionty
  metacell_mode (float, optional): The mode to use for metacell sampling. Defaults to 0.0.
@@ -51,9 +51,9 @@ class Dataset(torchDataset):
  lamin_dataset: ln.Collection
  genedf: Optional[pd.DataFrame] = None
  # set of obs to prepare for prediction (encode)
- clss_to_predict: Optional[list[str]] = field(default_factory=list)
+ clss_to_predict: Optional[List[str]] = field(default_factory=list)
  # set of obs that need to be hierarchically prepared
- hierarchical_clss: Optional[list[str]] = field(default_factory=list)
+ hierarchical_clss: Optional[List[str]] = field(default_factory=list)
  join_vars: Literal["inner", "outer"] | None = None
  metacell_mode: float = 0.0
  get_knn_cells: bool = False
@@ -61,6 +61,7 @@ class Dataset(torchDataset):
  force_recompute_indices: bool = False

  def __post_init__(self):
+ # see at the end of the file for the mapped function
  self.mapped_dataset = mapped(
  self.lamin_dataset,
  obs_keys=list(set(self.hierarchical_clss + self.clss_to_predict)),
@@ -102,10 +103,10 @@ class Dataset(torchDataset):
  "need 'organism_ontology_term_id' in the set of classes if you don't provide a genedf"
  )
  self.organisms = list(self.class_topred["organism_ontology_term_id"])
- self.organisms.sort()
  self.genedf = load_genes(self.organisms)
  else:
- self.organisms = None
+ self.organisms = self.genedf["organism"].unique().tolist()
+ self.organisms.sort()

  self.genedf.columns = self.genedf.columns.astype(str)
  # self.check_aligned_vars()
@@ -160,13 +161,11 @@ class Dataset(torchDataset):
  + " {} metacell_mode\n".format(self.metacell_mode)
  )

- def get_label_weights(
+ def get_label_cats(
  self,
- obs_keys: str | list[str],
- scaler: int = 10,
- return_categories=False,
+ obs_keys: Union[str, List[str]],
  ):
- """Get all weights for the given label keys."""
+ """Get all categories for the given label keys."""
  if isinstance(obs_keys, str):
  obs_keys = [obs_keys]
  labels = None
@@ -176,18 +175,7 @@ class Dataset(torchDataset):
  labels = labels_to_str
  else:
  labels = concat_categorical_codes([labels, labels_to_str])
- counter = Counter(labels.codes) # type: ignore
- if return_categories:
- counter = np.array(list(counter.values()))
- weights = scaler / (counter + scaler)
- return weights, np.array(labels.codes)
- else:
- counts = np.array([counter[label] for label in labels.codes])
- if scaler is None:
- weights = 1.0 / counts
- else:
- weights = scaler / (counts + scaler)
- return weights
+ return np.array(labels.codes)

  def get_unseen_mapped_dataset_elements(self, idx: int):
  """
@@ -197,16 +185,16 @@ class Dataset(torchDataset):
  idx (int): index of the element to get

  Returns:
- list[str]: list of unseen genes
+ List[str]: list of unseen genes
  """
  return [str(i)[2:-1] for i in self.mapped_dataset.uns(idx, "unseen_genes")]

- def define_hierarchies(self, clsses: list[str]):
+ def define_hierarchies(self, clsses: List[str]):
  """
  define_hierarchies is a method to define the hierarchies for the classes to predict

  Args:
- clsses (list[str]): list of classes to predict
+ clsses (List[str]): list of classes to predict

  Raises:
  ValueError: if the class is not in the accepted classes
@@ -233,19 +221,19 @@ class Dataset(torchDataset):
  elif clss == "cell_type_ontology_term_id":
  parentdf = (
  bt.CellType.filter()
- .df(include=["parents__ontology_id"])
+ .df(include=["parents__ontology_id", "ontology_id"])
  .set_index("ontology_id")
  )
  elif clss == "tissue_ontology_term_id":
  parentdf = (
  bt.Tissue.filter()
- .df(include=["parents__ontology_id"])
+ .df(include=["parents__ontology_id", "ontology_id"])
  .set_index("ontology_id")
  )
  elif clss == "disease_ontology_term_id":
  parentdf = (
  bt.Disease.filter()
- .df(include=["parents__ontology_id"])
+ .df(include=["parents__ontology_id", "ontology_id"])
  .set_index("ontology_id")
  )
  elif clss in [
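Every ontology lookup in this hunk and the next adds `"ontology_id"` to the `include` list, presumably so the column is materialized before the chained `.set_index("ontology_id")`; that reading is an inference from the code shape. A sketch of the pattern (assumes a configured lamindb/bionty instance with the relevant ontology source loaded):

```python
import bionty as bt

# Request the future index column explicitly so set_index() can find it.
parentdf = (
    bt.CellType.filter()
    .df(include=["parents__ontology_id", "ontology_id"])
    .set_index("ontology_id")
)
print(parentdf.head())
```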
@@ -255,19 +243,19 @@ class Dataset(torchDataset):
  ]:
  parentdf = (
  bt.DevelopmentalStage.filter()
- .df(include=["parents__ontology_id"])
+ .df(include=["parents__ontology_id", "ontology_id"])
  .set_index("ontology_id")
  )
  elif clss == "assay_ontology_term_id":
  parentdf = (
  bt.ExperimentalFactor.filter()
- .df(include=["parents__ontology_id"])
+ .df(include=["parents__ontology_id", "ontology_id"])
  .set_index("ontology_id")
  )
  elif clss == "self_reported_ethnicity_ontology_term_id":
  parentdf = (
  bt.Ethnicity.filter()
- .df(include=["parents__ontology_id"])
+ .df(include=["parents__ontology_id", "ontology_id"])
  .set_index("ontology_id")
  )

@@ -279,6 +267,7 @@ class Dataset(torchDataset):
  )
  cats = set(self.mapped_dataset.encoders[clss].keys())
  groupings, _, leaf_labels = get_ancestry_mapping(cats, parentdf)
+ groupings.pop(None, None)
  for i, j in groupings.items():
  if len(j) == 0:
  # that should not happen
@@ -286,6 +275,7 @@ class Dataset(torchDataset):

  pdb.set_trace()
  groupings.pop(i)
+
  self.labels_groupings[clss] = groupings
  if clss in self.clss_to_predict:
  # if we have added new clss, we need to update the encoder with them too.
@@ -331,9 +321,10 @@ class SimpleAnnDataset(torchDataset):
  def __init__(
  self,
  adata: AnnData,
- obs_to_output: Optional[list[str]] = [],
+ obs_to_output: Optional[List[str]] = [],
  layer: Optional[str] = None,
  get_knn_cells: bool = False,
+ encoder: Optional[dict[str, dict]] = None,
  ):
  """
  SimpleAnnDataset is a simple dataloader for an AnnData dataset. this is to interface nicely with the rest of
@@ -342,12 +333,14 @@ class SimpleAnnDataset(torchDataset):
  Args:
  ----
  adata (anndata.AnnData): anndata object to use
- obs_to_output (list[str]): list of observations to output from anndata.obs
+ obs_to_output (List[str]): list of observations to output from anndata.obs
  layer (str): layer of the anndata to use
  get_knn_cells (bool): whether to get the knn cells
+ encoder (dict[str, dict]): dictionary of encoders for the observations.
  """
  self.adataX = adata.layers[layer] if layer is not None else adata.X
  self.adataX = self.adataX.toarray() if issparse(self.adataX) else self.adataX
+ self.encoder = encoder if encoder is not None else {}

  self.obs_to_output = adata.obs[obs_to_output]
  self.get_knn_cells = get_knn_cells
@@ -361,23 +354,14 @@ class SimpleAnnDataset(torchDataset):

  def __iter__(self):
  for idx in range(self.adataX.shape[0]):
- out = {"X": self.adataX[idx].reshape(-1)}
- out.update(
- {name: val for name, val in self.obs_to_output.iloc[idx].items()}
- )
- if self.get_knn_cells:
- distances = self.distances[idx].toarray()[0]
- nn_idx = np.argsort(-1 / (distances - 1e-6))[:6]
- out["knn_cells"] = np.array(
- [self.adataX[i].reshape(-1) for i in nn_idx],
- dtype=int,
- )
- out["distances"] = distances[nn_idx]
+ out = self.__getitem__(idx)
  yield out

  def __getitem__(self, idx):
  out = {"X": self.adataX[idx].reshape(-1)}
- out.update({name: val for name, val in self.obs_to_output.iloc[idx].items()})
+ # put the observation into the output and encode if needed
+ for name, val in self.obs_to_output.iloc[idx].items():
+ out.update({name: self.encoder[name][val] if name in self.encoder else val})
  if self.get_knn_cells:
  distances = self.distances[idx].toarray()[0]
  nn_idx = np.argsort(-1 / (distances - 1e-6))[:6]
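Two things change here: `__iter__` now delegates to `__getitem__`, so iteration and random access share one code path (the duplicated knn block disappears), and the neighbor distances are returned under `knn_cells_info`, matching the key the Collator now collates. The new `encoder` hook maps raw obs values to integer ids at access time. A hedged usage sketch with a toy AnnData and hypothetical labels:

```python
import numpy as np
import anndata as ad
from scdataloader.data import SimpleAnnDataset

adata = ad.AnnData(
    X=np.random.poisson(1.0, size=(4, 10)).astype(np.float32),
    obs={"cell_type_ontology_term_id": ["CL:0000236"] * 2 + ["CL:0000084"] * 2},
)
ds = SimpleAnnDataset(
    adata,
    obs_to_output=["cell_type_ontology_term_id"],
    encoder={"cell_type_ontology_term_id": {"CL:0000236": 0, "CL:0000084": 1}},
)
print(ds[0]["cell_type_ontology_term_id"])  # 0 — the encoded label
```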
@@ -385,17 +369,17 @@ class SimpleAnnDataset(torchDataset):
  [self.adataX[i].reshape(-1) for i in nn_idx],
  dtype=int,
  )
- out["distances"] = distances[nn_idx]
+ out["knn_cells_info"] = distances[nn_idx]
  return out


  def mapped(
- obs_keys: list[str] | None = None,
- obsm_keys: list[str] | None = None,
+ obs_keys: List[str] | None = None,
+ obsm_keys: List[str] | None = None,
  obs_filter: dict[str, str | tuple[str, ...]] | None = None,
  join: Literal["inner", "outer"] | None = "inner",
- encode_labels: bool | list[str] = True,
+ encode_labels: bool | List[str] = True,
  unknown_label: str | dict[str, str] | None = None,
  cache_categories: bool = True,
  parallel: bool = False,
401
385
  parallel: bool = False,
@@ -403,7 +387,7 @@ def mapped(
403
387
  stream: bool = False,
404
388
  is_run_input: bool | None = None,
405
389
  metacell_mode: bool = False,
406
- meta_assays: list[str] = ["EFO:0022857", "EFO:0010961"],
390
+ meta_assays: List[str] = ["EFO:0022857", "EFO:0010961"],
407
391
  get_knn_cells: bool = False,
408
392
  store_location: str | None = None,
409
393
  force_recompute_indices: bool = False,
@@ -440,7 +424,7 @@ def mapped(
  return ds


- def concat_categorical_codes(series_list: list[pd.Categorical]) -> pd.Categorical:
+ def concat_categorical_codes(series_list: List[pd.Categorical]) -> pd.Categorical:
  """Efficiently combine multiple categorical data using their codes,
  only creating categories for combinations that exist in the data.