scdataloader 1.9.2__py3-none-any.whl → 2.0.2__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
scdataloader/data.py CHANGED
@@ -2,7 +2,7 @@ import warnings
 from collections import Counter
 from dataclasses import dataclass, field
 from functools import reduce
-from typing import Literal, Optional, Union
+from typing import List, Literal, Optional, Union
 
 # ln.connect("scprint")
 import bionty as bt
@@ -16,7 +16,6 @@ from torch.utils.data import Dataset as torchDataset
 
 from scdataloader.utils import get_ancestry_mapping, load_genes
 
-from .config import LABELS_TOADD
 from .mapped import MappedCollection, _Connect
 
 
@@ -39,28 +38,30 @@ class Dataset(torchDataset):
     ----
         lamin_dataset (lamindb.Dataset): lamin dataset to load
         genedf (pd.Dataframe): dataframe containing the genes to load
-        organisms (list[str]): list of organisms to load
-            (for now only validates the the genes map to this organism)
-        obs (list[str]): list of observations to load from the Collection
-        clss_to_predict (list[str]): list of observations to encode
+        obs (List[str]): list of observations to load from the Collection
+        clss_to_predict (List[str]): list of observations to encode
         join_vars (flag): join variables @see :meth:`~lamindb.Dataset.mapped`.
         hierarchical_clss: list of observations to map to a hierarchy using lamin's bionty
+        metacell_mode (float, optional): The mode to use for metacell sampling. Defaults to 0.0.
+        get_knn_cells (bool, optional): Whether to get the k-nearest neighbors of each cell. Defaults to False.
+        store_location (str, optional): The location to store the sampler indices. Defaults to None.
+        force_recompute_indices (bool, optional): Whether to force recompute the sampler indices. Defaults to False.
     """
 
     lamin_dataset: ln.Collection
     genedf: Optional[pd.DataFrame] = None
-    organisms: Optional[Union[list[str], str]] = field(
-        default_factory=["NCBITaxon:9606", "NCBITaxon:10090"]
-    )
     # set of obs to prepare for prediction (encode)
-    clss_to_predict: Optional[list[str]] = field(default_factory=list)
+    clss_to_predict: Optional[List[str]] = field(default_factory=list)
     # set of obs that need to be hierarchically prepared
-    hierarchical_clss: Optional[list[str]] = field(default_factory=list)
+    hierarchical_clss: Optional[List[str]] = field(default_factory=list)
     join_vars: Literal["inner", "outer"] | None = None
     metacell_mode: float = 0.0
     get_knn_cells: bool = False
+    store_location: str | None = None
+    force_recompute_indices: bool = False
 
     def __post_init__(self):
+        # see at the end of the file for the mapped function
         self.mapped_dataset = mapped(
             self.lamin_dataset,
             obs_keys=list(set(self.hierarchical_clss + self.clss_to_predict)),
@@ -71,6 +72,8 @@ class Dataset(torchDataset):
             parallel=True,
             metacell_mode=self.metacell_mode,
             get_knn_cells=self.get_knn_cells,
+            store_location=self.store_location,
+            force_recompute_indices=self.force_recompute_indices,
         )
         print(
             "won't do any check but we recommend to have your dataset coming from local storage"
@@ -85,7 +88,7 @@ class Dataset(torchDataset):
             if clss not in self.hierarchical_clss:
                 # otherwise it's already been done
                 self.class_topred[clss] = set(
-                    self.mapped_dataset.get_merged_categories(clss)
+                    self.mapped_dataset.encoders[clss].keys()
                 )
                 if (
                     self.mapped_dataset.unknown_label
@@ -94,12 +97,19 @@ class Dataset(torchDataset):
                     self.class_topred[clss] -= set(
                         [self.mapped_dataset.unknown_label]
                     )
-
         if self.genedf is None:
+            if "organism_ontology_term_id" not in self.clss_to_predict:
+                raise ValueError(
+                    "need 'organism_ontology_term_id' in the set of classes if you don't provide a genedf"
+                )
+            self.organisms = list(self.class_topred["organism_ontology_term_id"])
             self.genedf = load_genes(self.organisms)
+        else:
+            self.organisms = self.genedf["organism"].unique().tolist()
+            self.organisms.sort()
 
         self.genedf.columns = self.genedf.columns.astype(str)
-        self.check_aligned_vars()
+        # self.check_aligned_vars()
 
     def check_aligned_vars(self):
         vars = self.genedf.index.tolist()
@@ -117,6 +127,10 @@ class Dataset(torchDataset):
     def encoder(self):
         return self.mapped_dataset.encoders
 
+    @encoder.setter
+    def encoder(self, encoder):
+        self.mapped_dataset.encoders = encoder
+
     def __getitem__(self, *args, **kwargs):
         item = self.mapped_dataset.__getitem__(*args, **kwargs)
         return item
@@ -132,7 +146,11 @@ class Dataset(torchDataset):
             + " {} genes\n".format(self.genedf.shape[0])
             + " {} clss_to_predict\n".format(len(self.clss_to_predict))
             + " {} hierarchical_clss\n".format(len(self.hierarchical_clss))
-            + " {} organisms\n".format(len(self.organisms))
+            + (
+                " {} organisms\n".format(len(self.organisms))
+                if self.organisms is not None
+                else ""
+            )
             + (
                 "dataset contains {} classes to predict\n".format(
                     sum([len(self.class_topred[i]) for i in self.class_topred])
@@ -143,41 +161,21 @@ class Dataset(torchDataset):
             + " {} metacell_mode\n".format(self.metacell_mode)
         )
 
-    def get_label_weights(
+    def get_label_cats(
         self,
-        obs_keys: str | list[str],
-        scaler: int = 10,
-        return_categories=False,
-        bypass_label=["neuron"],
+        obs_keys: Union[str, List[str]],
     ):
-        """Get all weights for the given label keys."""
+        """Get all categories for the given label keys."""
         if isinstance(obs_keys, str):
             obs_keys = [obs_keys]
-        labels_list = []
+        labels = None
         for label_key in obs_keys:
-            labels_to_str = (
-                self.mapped_dataset.get_merged_labels(label_key).astype(str).astype("O")
-            )
-            labels_list.append(labels_to_str)
-        if len(labels_list) > 1:
-            labels = ["___".join(labels_obs) for labels_obs in zip(*labels_list)]
-        else:
-            labels = labels_list[0]
-
-        counter = Counter(labels)  # type: ignore
-        if return_categories:
-            rn = {n: i for i, n in enumerate(counter.keys())}
-            labels = np.array([rn[label] for label in labels])
-            counter = np.array(list(counter.values()))
-            weights = scaler / (counter + scaler)
-            return weights, labels
-        else:
-            counts = np.array([counter[label] for label in labels])
-            if scaler is None:
-                weights = 1.0 / counts
+            labels_to_str = self.mapped_dataset.get_merged_labels(label_key)
+            if labels is None:
+                labels = labels_to_str
             else:
-                weights = scaler / (counts + scaler)
-            return weights
+                labels = concat_categorical_codes([labels, labels_to_str])
+        return np.array(labels.codes)
 
     def get_unseen_mapped_dataset_elements(self, idx: int):
         """
@@ -187,16 +185,16 @@ class Dataset(torchDataset):
             idx (int): index of the element to get
 
         Returns:
-            list[str]: list of unseen genes
+            List[str]: list of unseen genes
         """
         return [str(i)[2:-1] for i in self.mapped_dataset.uns(idx, "unseen_genes")]
 
-    def define_hierarchies(self, clsses: list[str]):
+    def define_hierarchies(self, clsses: List[str]):
         """
        define_hierarchies is a method to define the hierarchies for the classes to predict

        Args:
-            clsses (list[str]): list of classes to predict
+            clsses (List[str]): list of classes to predict

        Raises:
            ValueError: if the class is not in the accepted classes
@@ -223,19 +221,19 @@ class Dataset(torchDataset):
         elif clss == "cell_type_ontology_term_id":
             parentdf = (
                 bt.CellType.filter()
-                .df(include=["parents__ontology_id"])
+                .df(include=["parents__ontology_id", "ontology_id"])
                 .set_index("ontology_id")
             )
         elif clss == "tissue_ontology_term_id":
             parentdf = (
                 bt.Tissue.filter()
-                .df(include=["parents__ontology_id"])
+                .df(include=["parents__ontology_id", "ontology_id"])
                 .set_index("ontology_id")
             )
         elif clss == "disease_ontology_term_id":
             parentdf = (
                 bt.Disease.filter()
-                .df(include=["parents__ontology_id"])
+                .df(include=["parents__ontology_id", "ontology_id"])
                 .set_index("ontology_id")
             )
         elif clss in [
@@ -245,19 +243,19 @@ class Dataset(torchDataset):
         ]:
             parentdf = (
                 bt.DevelopmentalStage.filter()
-                .df(include=["parents__ontology_id"])
+                .df(include=["parents__ontology_id", "ontology_id"])
                 .set_index("ontology_id")
             )
         elif clss == "assay_ontology_term_id":
             parentdf = (
                 bt.ExperimentalFactor.filter()
-                .df(include=["parents__ontology_id"])
+                .df(include=["parents__ontology_id", "ontology_id"])
                 .set_index("ontology_id")
             )
         elif clss == "self_reported_ethnicity_ontology_term_id":
             parentdf = (
                 bt.Ethnicity.filter()
-                .df(include=["parents__ontology_id"])
+                .df(include=["parents__ontology_id", "ontology_id"])
                 .set_index("ontology_id")
             )
 
@@ -267,13 +265,17 @@ class Dataset(torchDataset):
                     clss
                 )
             )
-        cats = set(self.mapped_dataset.get_merged_categories(clss))
-        addition = set(LABELS_TOADD.get(clss, {}).values())
-        cats |= addition
+        cats = set(self.mapped_dataset.encoders[clss].keys())
         groupings, _, leaf_labels = get_ancestry_mapping(cats, parentdf)
+        groupings.pop(None, None)
         for i, j in groupings.items():
             if len(j) == 0:
+                # that should not happen
+                import pdb
+
+                pdb.set_trace()
                 groupings.pop(i)
+
         self.labels_groupings[clss] = groupings
         if clss in self.clss_to_predict:
             # if we have added new clss, we need to update the encoder with them too.
@@ -287,11 +289,12 @@ class Dataset(torchDataset):
                )
 
                for i, v in enumerate(
-                    addition - set(self.mapped_dataset.encoders[clss].keys())
+                    set(groupings.keys())
+                    - set(self.mapped_dataset.encoders[clss].keys())
                ):
                    self.mapped_dataset.encoders[clss].update({v: mlength + i})
-                # we need to change the ordering so that the things that can't be predicted appear afterward
 
+            # we need to change the ordering so that the things that can't be predicted appear afterward
            self.class_topred[clss] = leaf_labels
            c = 0
            update = {}
@@ -318,8 +321,10 @@ class SimpleAnnDataset(torchDataset):
     def __init__(
         self,
         adata: AnnData,
-        obs_to_output: Optional[list[str]] = [],
+        obs_to_output: Optional[List[str]] = [],
         layer: Optional[str] = None,
+        get_knn_cells: bool = False,
+        encoder: Optional[dict[str, dict]] = None,
     ):
         """
         SimpleAnnDataset is a simple dataloader for an AnnData dataset. this is to interface nicely with the rest of
@@ -328,43 +333,53 @@ class SimpleAnnDataset(torchDataset):
         Args:
         ----
             adata (anndata.AnnData): anndata object to use
-            obs_to_output (list[str]): list of observations to output from anndata.obs
+            obs_to_output (List[str]): list of observations to output from anndata.obs
             layer (str): layer of the anndata to use
+            get_knn_cells (bool): whether to get the knn cells
+            encoder (dict[str, dict]): dictionary of encoders for the observations.
         """
         self.adataX = adata.layers[layer] if layer is not None else adata.X
         self.adataX = self.adataX.toarray() if issparse(self.adataX) else self.adataX
+        self.encoder = encoder if encoder is not None else {}
+
         self.obs_to_output = adata.obs[obs_to_output]
+        self.get_knn_cells = get_knn_cells
+        if get_knn_cells and "connectivities" not in adata.obsp:
+            raise ValueError("neighbors key not found in adata.obsm")
+        if get_knn_cells:
+            self.distances = adata.obsp["distances"]
 
     def __len__(self):
        return self.adataX.shape[0]

    def __iter__(self):
-        for idx, obs in enumerate(self.adata.obs.itertuples(index=False)):
-            with warnings.catch_warnings():
-                warnings.filterwarnings("ignore", category=DeprecationWarning)
-                out = {"X": self.adataX[idx].reshape(-1)}
-                out.update(
-                    {name: val for name, val in self.obs_to_output.iloc[idx].items()}
-                )
-                yield out
+        for idx in range(self.adataX.shape[0]):
+            out = self.__getitem__(idx)
+            yield out

    def __getitem__(self, idx):
-        with warnings.catch_warnings():
-            warnings.filterwarnings("ignore", category=DeprecationWarning)
-            out = {"X": self.adataX[idx].reshape(-1)}
-            out.update(
-                {name: val for name, val in self.obs_to_output.iloc[idx].items()}
+        out = {"X": self.adataX[idx].reshape(-1)}
+        # put the observation into the output and encode if needed
+        for name, val in self.obs_to_output.iloc[idx].items():
+            out.update({name: self.encoder[name][val] if name in self.encoder else val})
+        if self.get_knn_cells:
+            distances = self.distances[idx].toarray()[0]
+            nn_idx = np.argsort(-1 / (distances - 1e-6))[:6]
+            out["knn_cells"] = np.array(
+                [self.adataX[i].reshape(-1) for i in nn_idx],
+                dtype=int,
             )
+            out["knn_cells_info"] = distances[nn_idx]
         return out
 
 
 def mapped(
     dataset,
-    obs_keys: list[str] | None = None,
-    obsm_keys: list[str] | None = None,
+    obs_keys: List[str] | None = None,
+    obsm_keys: List[str] | None = None,
     obs_filter: dict[str, str | tuple[str, ...]] | None = None,
     join: Literal["inner", "outer"] | None = "inner",
-    encode_labels: bool | list[str] = True,
+    encode_labels: bool | List[str] = True,
     unknown_label: str | dict[str, str] | None = None,
     cache_categories: bool = True,
     parallel: bool = False,
@@ -372,8 +387,10 @@ def mapped(
     stream: bool = False,
     is_run_input: bool | None = None,
     metacell_mode: bool = False,
-    meta_assays: list[str] = ["EFO:0022857", "EFO:0010961"],
+    meta_assays: List[str] = ["EFO:0022857", "EFO:0010961"],
     get_knn_cells: bool = False,
+    store_location: str | None = None,
+    force_recompute_indices: bool = False,
 ) -> MappedCollection:
     path_list = []
     for artifact in dataset.artifacts.all():
@@ -401,5 +418,45 @@ def mapped(
         meta_assays=meta_assays,
         metacell_mode=metacell_mode,
         get_knn_cells=get_knn_cells,
+        store_location=store_location,
+        force_recompute_indices=force_recompute_indices,
     )
     return ds
+
+
+def concat_categorical_codes(series_list: List[pd.Categorical]) -> pd.Categorical:
+    """Efficiently combine multiple categorical data using their codes,
+    only creating categories for combinations that exist in the data.
+
+    Args:
+        series_list: List of pandas Categorical data
+
+    Returns:
+        Combined Categorical with only existing combinations
+    """
+    # Get the codes for each categorical
+    codes_list = [s.codes.astype(np.int32) for s in series_list]
+    n_cats = [len(s.categories) for s in series_list]
+
+    # Calculate combined codes
+    combined_codes = codes_list[0]
+    multiplier = n_cats[0]
+    for codes, n_cat in zip(codes_list[1:], n_cats[1:]):
+        combined_codes = (combined_codes * n_cat) + codes
+        multiplier *= n_cat
+
+    # Find unique combinations that actually exist in the data
+    unique_existing_codes = np.unique(combined_codes)
+
+    # Create a mapping from old codes to new compressed codes
+    code_mapping = {old: new for new, old in enumerate(unique_existing_codes)}
+
+    # Map the combined codes to their new compressed values
+    combined_codes = np.array([code_mapping[code] for code in combined_codes])
+
+    # Create final categorical with only existing combinations
+    return pd.Categorical.from_codes(
+        codes=combined_codes,
+        categories=np.arange(len(unique_existing_codes)),
+        ordered=False,
+    )