scdataloader 1.9.1__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scdataloader/data.py CHANGED
@@ -16,7 +16,6 @@ from torch.utils.data import Dataset as torchDataset
 
 from scdataloader.utils import get_ancestry_mapping, load_genes
 
-from .config import LABELS_TOADD
 from .mapped import MappedCollection, _Connect
 
 
@@ -39,19 +38,18 @@ class Dataset(torchDataset):
     ----
         lamin_dataset (lamindb.Dataset): lamin dataset to load
         genedf (pd.Dataframe): dataframe containing the genes to load
-        organisms (list[str]): list of organisms to load
-            (for now only validates that the genes map to this organism)
         obs (list[str]): list of observations to load from the Collection
         clss_to_predict (list[str]): list of observations to encode
         join_vars (flag): join variables @see :meth:`~lamindb.Dataset.mapped`.
         hierarchical_clss: list of observations to map to a hierarchy using lamin's bionty
+        metacell_mode (float, optional): The mode to use for metacell sampling. Defaults to 0.0.
+        get_knn_cells (bool, optional): Whether to get the k-nearest neighbors of each cell. Defaults to False.
+        store_location (str, optional): The location to store the sampler indices. Defaults to None.
+        force_recompute_indices (bool, optional): Whether to force recompute the sampler indices. Defaults to False.
     """
 
     lamin_dataset: ln.Collection
     genedf: Optional[pd.DataFrame] = None
-    organisms: Optional[Union[list[str], str]] = field(
-        default_factory=["NCBITaxon:9606", "NCBITaxon:10090"]
-    )
     # set of obs to prepare for prediction (encode)
     clss_to_predict: Optional[list[str]] = field(default_factory=list)
     # set of obs that need to be hierarchically prepared
@@ -59,6 +57,8 @@ class Dataset(torchDataset):
     join_vars: Literal["inner", "outer"] | None = None
     metacell_mode: float = 0.0
     get_knn_cells: bool = False
+    store_location: str | None = None
+    force_recompute_indices: bool = False
 
     def __post_init__(self):
         self.mapped_dataset = mapped(
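
The two new fields let the sampler indices computed at startup be cached on disk and reused across runs. A minimal usage sketch, assuming a local lamindb instance with an existing Collection; the collection name and cache path are hypothetical:

import lamindb as ln
from scdataloader import Dataset

# hypothetical collection; any lamindb Collection of AnnData artifacts works
col = ln.Collection.filter(name="my-collection").first()
dataset = Dataset(
    lamin_dataset=col,
    clss_to_predict=["organism_ontology_term_id", "cell_type_ontology_term_id"],
    store_location="~/.cache/scdataloader_indices",  # persist sampler indices here
    force_recompute_indices=False,  # set True after the collection changes
)
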
@@ -71,6 +71,8 @@ class Dataset(torchDataset):
             parallel=True,
             metacell_mode=self.metacell_mode,
             get_knn_cells=self.get_knn_cells,
+            store_location=self.store_location,
+            force_recompute_indices=self.force_recompute_indices,
         )
         print(
             "won't do any check but we recommend to have your dataset coming from local storage"
@@ -85,7 +87,7 @@ class Dataset(torchDataset):
             if clss not in self.hierarchical_clss:
                 # otherwise it's already been done
                 self.class_topred[clss] = set(
-                    self.mapped_dataset.get_merged_categories(clss)
+                    self.mapped_dataset.encoders[clss].keys()
                 )
                 if (
                     self.mapped_dataset.unknown_label
@@ -94,12 +96,19 @@ class Dataset(torchDataset):
                     self.class_topred[clss] -= set(
                         [self.mapped_dataset.unknown_label]
                     )
-
         if self.genedf is None:
+            if "organism_ontology_term_id" not in self.clss_to_predict:
+                raise ValueError(
+                    "need 'organism_ontology_term_id' in the set of classes if you don't provide a genedf"
+                )
+            self.organisms = list(self.class_topred["organism_ontology_term_id"])
+            self.organisms.sort()
             self.genedf = load_genes(self.organisms)
+        else:
+            self.organisms = None
 
         self.genedf.columns = self.genedf.columns.astype(str)
-        self.check_aligned_vars()
+        # self.check_aligned_vars()
 
     def check_aligned_vars(self):
         vars = self.genedf.index.tolist()
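
With this change, organisms are no longer a constructor default (previously the human and mouse NCBITaxon IDs) but are inferred from the data: the sorted set of organism_ontology_term_id values known to the encoders is passed to load_genes. A sketch of the equivalent standalone call; the two NCBITaxon IDs shown are the old defaults, used here only for illustration:

from scdataloader.utils import load_genes

# what __post_init__ now does internally when genedf is None:
# collect organisms from the encoded classes, sort them, then load genes
organisms = sorted(["NCBITaxon:9606", "NCBITaxon:10090"])  # human, mouse
genedf = load_genes(organisms)  # gene metadata dataframe covering both organisms
print(genedf.shape)
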
@@ -117,6 +126,10 @@ class Dataset(torchDataset):
     def encoder(self):
         return self.mapped_dataset.encoders
 
+    @encoder.setter
+    def encoder(self, encoder):
+        self.mapped_dataset.encoders = encoder
+
     def __getitem__(self, *args, **kwargs):
         item = self.mapped_dataset.__getitem__(*args, **kwargs)
         return item
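
The new setter turns encoder into a read/write property, so one label-to-code mapping can be pushed into several datasets, for example to keep a train and a validation split encoding classes identically. A toy stand-in for the pattern (the real property forwards to self.mapped_dataset.encoders):

class _EncoderDemo:
    """Toy class mirroring the new read/write encoder property."""

    def __init__(self):
        self._encoders = {"cell_type_ontology_term_id": {"CL:0000540": 0}}

    @property
    def encoder(self):
        return self._encoders

    @encoder.setter
    def encoder(self, encoder):
        self._encoders = encoder


train, val = _EncoderDemo(), _EncoderDemo()
val.encoder = train.encoder  # both splits now share one label -> code mapping
assert val.encoder is train.encoder
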
@@ -132,7 +145,11 @@ class Dataset(torchDataset):
             + " {} genes\n".format(self.genedf.shape[0])
             + " {} clss_to_predict\n".format(len(self.clss_to_predict))
             + " {} hierarchical_clss\n".format(len(self.hierarchical_clss))
-            + " {} organisms\n".format(len(self.organisms))
+            + (
+                " {} organisms\n".format(len(self.organisms))
+                if self.organisms is not None
+                else ""
+            )
             + (
                 "dataset contains {} classes to predict\n".format(
                     sum([len(self.class_topred[i]) for i in self.class_topred])
@@ -148,31 +165,24 @@ class Dataset(torchDataset):
         obs_keys: str | list[str],
         scaler: int = 10,
         return_categories=False,
-        bypass_label=["neuron"],
     ):
         """Get all weights for the given label keys."""
         if isinstance(obs_keys, str):
             obs_keys = [obs_keys]
-        labels_list = []
+        labels = None
         for label_key in obs_keys:
-            labels_to_str = (
-                self.mapped_dataset.get_merged_labels(label_key).astype(str).astype("O")
-            )
-            labels_list.append(labels_to_str)
-        if len(labels_list) > 1:
-            labels = ["___".join(labels_obs) for labels_obs in zip(*labels_list)]
-        else:
-            labels = labels_list[0]
-
-        counter = Counter(labels)  # type: ignore
+            labels_to_str = self.mapped_dataset.get_merged_labels(label_key)
+            if labels is None:
+                labels = labels_to_str
+            else:
+                labels = concat_categorical_codes([labels, labels_to_str])
+        counter = Counter(labels.codes)  # type: ignore
         if return_categories:
-            rn = {n: i for i, n in enumerate(counter.keys())}
-            labels = np.array([rn[label] for label in labels])
             counter = np.array(list(counter.values()))
             weights = scaler / (counter + scaler)
-            return weights, labels
+            return weights, np.array(labels.codes)
         else:
-            counts = np.array([counter[label] for label in labels])
+            counts = np.array([counter[label] for label in labels.codes])
             if scaler is None:
                 weights = 1.0 / counts
             else:
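
get_label_weights now stays in integer-code space instead of concatenating labels as "___"-joined strings: each obs column's categorical codes are fused into one code per cell via concat_categorical_codes (added at the bottom of this file), and weights are derived from code frequencies. A self-contained sketch of the else branch on toy data:

from collections import Counter

import numpy as np
import pandas as pd

# two obs columns, fused the same way concat_categorical_codes does:
# code = code_a * n_categories_b + code_b, then compressed to existing combos
cell_type = pd.Categorical(["T", "T", "B", "T"])
tissue = pd.Categorical(["blood", "blood", "blood", "lung"])
fused = cell_type.codes.astype(np.int32) * len(tissue.categories) + tissue.codes
_, fused = np.unique(fused, return_inverse=True)

counter = Counter(fused)
counts = np.array([counter[c] for c in fused])  # per-cell combination frequency
scaler = 10
weights = scaler / (counts + scaler)  # rarer combinations get larger weights
print(weights)  # approx. [0.83 0.83 0.91 0.91]
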
@@ -267,12 +277,14 @@ class Dataset(torchDataset):
                     clss
                 )
             )
-            cats = set(self.mapped_dataset.get_merged_categories(clss))
-            addition = set(LABELS_TOADD.get(clss, {}).values())
-            cats |= addition
+            cats = set(self.mapped_dataset.encoders[clss].keys())
             groupings, _, leaf_labels = get_ancestry_mapping(cats, parentdf)
             for i, j in groupings.items():
                 if len(j) == 0:
+                    # that should not happen
+                    import pdb
+
+                    pdb.set_trace()
                     groupings.pop(i)
             self.labels_groupings[clss] = groupings
             if clss in self.clss_to_predict:
@@ -287,11 +299,12 @@ class Dataset(torchDataset):
                 )
 
                 for i, v in enumerate(
-                    addition - set(self.mapped_dataset.encoders[clss].keys())
+                    set(groupings.keys())
+                    - set(self.mapped_dataset.encoders[clss].keys())
                 ):
                     self.mapped_dataset.encoders[clss].update({v: mlength + i})
-                # we need to change the ordering so that the things that can't be predicted appear afterward
 
+                # we need to change the ordering so that the things that can't be predicted appear afterward
                 self.class_topred[clss] = leaf_labels
                 c = 0
                 update = {}
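
The hunk ends just as the re-indexing begins (c = 0; update = {}); the comment states the goal: leaf labels that can actually be predicted keep the low codes, while grouping-only ancestors appended above get pushed to the end. A plausible toy reconstruction of that reordering, not the package's exact code:

# encoder where a non-leaf grouping label ("cell") sits between two leaves
encoder = {"CL:0000084": 0, "CL:0000000": 1, "CL:0000540": 2}
leaf_labels = {"CL:0000084", "CL:0000540"}  # only these are predicted

c = 0
tail = len(leaf_labels)
update = {}
for label in encoder:
    if label in leaf_labels:
        update[label] = c  # predictable leaves first
        c += 1
    else:
        update[label] = tail  # non-predictable groupings afterward
        tail += 1
print(update)  # {'CL:0000084': 0, 'CL:0000000': 2, 'CL:0000540': 1}
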
@@ -320,6 +333,7 @@ class SimpleAnnDataset(torchDataset):
         adata: AnnData,
         obs_to_output: Optional[list[str]] = [],
         layer: Optional[str] = None,
+        get_knn_cells: bool = False,
     ):
         """
         SimpleAnnDataset is a simple dataloader for an AnnData dataset. this is to interface nicely with the rest of
@@ -330,31 +344,48 @@ class SimpleAnnDataset(torchDataset):
             adata (anndata.AnnData): anndata object to use
             obs_to_output (list[str]): list of observations to output from anndata.obs
             layer (str): layer of the anndata to use
+            get_knn_cells (bool): whether to get the knn cells
         """
         self.adataX = adata.layers[layer] if layer is not None else adata.X
         self.adataX = self.adataX.toarray() if issparse(self.adataX) else self.adataX
+
         self.obs_to_output = adata.obs[obs_to_output]
+        self.get_knn_cells = get_knn_cells
+        if get_knn_cells and "connectivities" not in adata.obsp:
+            raise ValueError("neighbors key not found in adata.obsm")
+        if get_knn_cells:
+            self.distances = adata.obsp["distances"]
 
     def __len__(self):
         return self.adataX.shape[0]
 
     def __iter__(self):
-        for idx, obs in enumerate(self.adata.obs.itertuples(index=False)):
-            with warnings.catch_warnings():
-                warnings.filterwarnings("ignore", category=DeprecationWarning)
-                out = {"X": self.adataX[idx].reshape(-1)}
-                out.update(
-                    {name: val for name, val in self.obs_to_output.iloc[idx].items()}
-                )
-                yield out
-
-    def __getitem__(self, idx):
-        with warnings.catch_warnings():
-            warnings.filterwarnings("ignore", category=DeprecationWarning)
+        for idx in range(self.adataX.shape[0]):
             out = {"X": self.adataX[idx].reshape(-1)}
             out.update(
                 {name: val for name, val in self.obs_to_output.iloc[idx].items()}
             )
+            if self.get_knn_cells:
+                distances = self.distances[idx].toarray()[0]
+                nn_idx = np.argsort(-1 / (distances - 1e-6))[:6]
+                out["knn_cells"] = np.array(
+                    [self.adataX[i].reshape(-1) for i in nn_idx],
+                    dtype=int,
+                )
+                out["distances"] = distances[nn_idx]
+            yield out
+
+    def __getitem__(self, idx):
+        out = {"X": self.adataX[idx].reshape(-1)}
+        out.update({name: val for name, val in self.obs_to_output.iloc[idx].items()})
+        if self.get_knn_cells:
+            distances = self.distances[idx].toarray()[0]
+            nn_idx = np.argsort(-1 / (distances - 1e-6))[:6]
+            out["knn_cells"] = np.array(
+                [self.adataX[i].reshape(-1) for i in nn_idx],
+                dtype=int,
+            )
+            out["distances"] = distances[nn_idx]
        return out
 
 
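When get_knn_cells=True, SimpleAnnDataset now requires a precomputed neighbor graph in adata.obsp (note the check reads "connectivities" from obsp, while the error message mentions obsm). A usage sketch, assuming scanpy for the neighbor computation and its bundled pbmc68k_reduced demo data:

import scanpy as sc
from scdataloader.data import SimpleAnnDataset

adata = sc.datasets.pbmc68k_reduced()
sc.pp.neighbors(adata, n_neighbors=15)  # fills obsp["distances"] / ["connectivities"]

ds = SimpleAnnDataset(adata, obs_to_output=["bulk_labels"], get_knn_cells=True)
item = ds[0]
print(item["X"].shape)          # expression vector of cell 0
print(item["knn_cells"].shape)  # 6 nearest-neighbor expression profiles
print(item["distances"])        # their distances to cell 0
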
@@ -374,6 +405,8 @@ def mapped(
     metacell_mode: bool = False,
     meta_assays: list[str] = ["EFO:0022857", "EFO:0010961"],
     get_knn_cells: bool = False,
+    store_location: str | None = None,
+    force_recompute_indices: bool = False,
 ) -> MappedCollection:
     path_list = []
     for artifact in dataset.artifacts.all():
@@ -401,5 +434,45 @@ def mapped(
         meta_assays=meta_assays,
         metacell_mode=metacell_mode,
         get_knn_cells=get_knn_cells,
+        store_location=store_location,
+        force_recompute_indices=force_recompute_indices,
     )
     return ds
+
+
+def concat_categorical_codes(series_list: list[pd.Categorical]) -> pd.Categorical:
+    """Efficiently combine multiple categorical data using their codes,
+    only creating categories for combinations that exist in the data.
+
+    Args:
+        series_list: List of pandas Categorical data
+
+    Returns:
+        Combined Categorical with only existing combinations
+    """
+    # Get the codes for each categorical
+    codes_list = [s.codes.astype(np.int32) for s in series_list]
+    n_cats = [len(s.categories) for s in series_list]
+
+    # Calculate combined codes
+    combined_codes = codes_list[0]
+    multiplier = n_cats[0]
+    for codes, n_cat in zip(codes_list[1:], n_cats[1:]):
+        combined_codes = (combined_codes * n_cat) + codes
+        multiplier *= n_cat
+
+    # Find unique combinations that actually exist in the data
+    unique_existing_codes = np.unique(combined_codes)
+
+    # Create a mapping from old codes to new compressed codes
+    code_mapping = {old: new for new, old in enumerate(unique_existing_codes)}
+
+    # Map the combined codes to their new compressed values
+    combined_codes = np.array([code_mapping[code] for code in combined_codes])
+
+    # Create final categorical with only existing combinations
+    return pd.Categorical.from_codes(
+        codes=combined_codes,
+        categories=np.arange(len(unique_existing_codes)),
+        ordered=False,
+    )
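
A usage sketch for the helper above, assuming the function is in scope (it needs numpy as np and pandas as pd): two categoricals over four cells are fused into a single categorical whose codes enumerate only the (organism, cell type) pairs that actually occur:

import numpy as np
import pandas as pd

a = pd.Categorical(["human", "mouse", "human", "human"])
b = pd.Categorical(["T", "T", "B", "T"])
combined = concat_categorical_codes([a, b])
print(combined.codes)            # [1 2 0 1]: one code per existing pair
print(len(combined.categories))  # 3, not 2 * 2: ("mouse", "B") never occurs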