scdataloader 2.0.0__py3-none-any.whl → 2.0.3__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- scdataloader/__main__.py +4 -5
- scdataloader/collator.py +65 -56
- scdataloader/data.py +38 -54
- scdataloader/datamodule.py +139 -86
- scdataloader/mapped.py +27 -25
- scdataloader/preprocess.py +31 -16
- scdataloader/utils.py +120 -20
- {scdataloader-2.0.0.dist-info → scdataloader-2.0.3.dist-info}/METADATA +5 -5
- scdataloader-2.0.3.dist-info/RECORD +16 -0
- {scdataloader-2.0.0.dist-info → scdataloader-2.0.3.dist-info}/WHEEL +1 -1
- scdataloader-2.0.0.dist-info/RECORD +0 -16
- {scdataloader-2.0.0.dist-info → scdataloader-2.0.3.dist-info}/entry_points.txt +0 -0
- {scdataloader-2.0.0.dist-info → scdataloader-2.0.3.dist-info}/licenses/LICENSE +0 -0
scdataloader/datamodule.py
CHANGED
@@ -1,17 +1,18 @@
+import math
 import multiprocessing as mp
 import os
 import random
 import time
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from functools import partial
-from typing import Optional, Sequence, Union
+from typing import List, Optional, Sequence, Union
 
 import lamindb as ln
 import lightning as L
 import numpy as np
 import pandas as pd
 import torch
-from torch.utils.data import DataLoader, Sampler
+from torch.utils.data import DataLoader, Sampler, Subset
 from torch.utils.data.sampler import (
     RandomSampler,
     SequentialSampler,
@@ -25,32 +26,30 @@ from .data import Dataset
 from .utils import fileToList, getBiomartTable, listToFile
 
 FILE_DIR = os.path.dirname(os.path.abspath(__file__))
+NNZ_SCALE = 1000
 
 
 class DataModule(L.LightningDataModule):
     def __init__(
         self,
         collection_name: str,
-        clss_to_weight:
+        clss_to_weight: List[str] = ["organism_ontology_term_id"],
         weight_scaler: int = 10,
         n_samples_per_epoch: int = 2_000_000,
         validation_split: float = 0.2,
         test_split: float = 0,
-        gene_embeddings: str = "",
         use_default_col: bool = True,
-        gene_position_tolerance: int = 10_000,
         # this is for the mappedCollection
-        clss_to_predict:
-        hierarchical_clss:
+        clss_to_predict: List[str] = ["organism_ontology_term_id"],
+        hierarchical_clss: List[str] = [],
         # this is for the collator
         how: str = "random expr",
-
+        organism_col: str = "organism_ontology_term_id",
         max_len: int = 1000,
-        add_zero_genes: int = 100,
         replacement: bool = True,
-
+        gene_subset: Optional[list[str]] = None,
         tp_name: Optional[str] = None,  # "heat_diff"
-        assays_to_drop:
+        assays_to_drop: List[str] = [
             # "EFO:0008853", #patch seq
             # "EFO:0010961", # visium
             "EFO:0030007",  # ATACseq
@@ -62,6 +61,11 @@ class DataModule(L.LightningDataModule):
         force_recompute_indices: bool = False,
         sampler_workers: int = None,
         sampler_chunk_size: int = None,
+        organisms: Optional[str] = None,
+        genedf: Optional[pd.DataFrame] = None,
+        n_bins: int = 0,
+        curiculum: int = 0,
+        start_at: int = 0,
         **kwargs,
     ):
         """
@@ -78,23 +82,19 @@ class DataModule(L.LightningDataModule):
             validation_split (float, optional): The proportion of the dataset to include in the validation split. Defaults to 0.2.
             test_split (float, optional): The proportion of the dataset to include in the test split. Defaults to 0.
                 it will use a full dataset and will round to the nearest dataset's cell count.
-
+            use_default_col (bool, optional): Whether to use the default collator. Defaults to True.
+            clss_to_weight (List[str], optional): List of labels to weight in the trainer's weighted random sampler. Defaults to [].
+            assays_to_drop (List[str], optional): List of assays to drop from the dataset. Defaults to [].
+            gene_pos_file (Union[bool, str], optional): The path to the gene positions file. Defaults to True.
                 the file must have ensembl_gene_id as index.
                 This is used to subset the available genes further to the ones that have embeddings in your model.
-            use_default_col (bool, optional): Whether to use the default collator. Defaults to True.
-            gene_position_tolerance (int, optional): The tolerance for gene position. Defaults to 10_000.
-                any genes within this distance of each other will be considered at the same position.
-            clss_to_weight (list, optional): List of labels to weight in the trainer's weighted random sampler. Defaults to [].
-            assays_to_drop (list, optional): List of assays to drop from the dataset. Defaults to [].
-            do_gene_pos (Union[bool, str], optional): Whether to use gene positions. Defaults to True.
             max_len (int, optional): The maximum length of the input tensor. Defaults to 1000.
-            add_zero_genes (int, optional): The number of zero genes to add to the input tensor. Defaults to 100.
             how (str, optional): The method to use for the collator. Defaults to "random expr".
-
+            organism_col (str, optional): The name of the organism. Defaults to "organism_ontology_term_id".
             tp_name (Optional[str], optional): The name of the timepoint. Defaults to None.
-            hierarchical_clss (
+            hierarchical_clss (List[str], optional): List of hierarchical classes. Defaults to [].
             metacell_mode (float, optional): The probability of using metacell mode. Defaults to 0.0.
-            clss_to_predict (
+            clss_to_predict (List[str], optional): List of classes to predict. Defaults to ["organism_ontology_term_id"].
             get_knn_cells (bool, optional): Whether to get the k-nearest neighbors of each queried cells. Defaults to False.
             store_location (str, optional): The location to store the sampler indices. Defaults to None.
             force_recompute_indices (bool, optional): Whether to force recompute the sampler indices. Defaults to False.
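For orientation, a minimal usage sketch of the reworked constructor under the new 2.0.3 arguments. Everything below is illustrative rather than taken from the package docs: the collection key and label names are made up, `batch_size`/`num_workers` are assumed to be forwarded to the DataLoader via `**kwargs`, and `DataModule` is assumed to be importable from the package root.

    from scdataloader import DataModule

    dm = DataModule(
        collection_name="my-collection",  # hypothetical LaminDB collection key
        clss_to_weight=["organism_ontology_term_id", "nnz"],
        clss_to_predict=["organism_ontology_term_id", "cell_type_ontology_term_id"],
        n_bins=0,       # forwarded to the Collator
        curiculum=50,   # new curriculum knob forwarded to LabelWeightedSampler
        start_at=0,     # resume offset used by RankShardSampler
        batch_size=64,
        num_workers=8,
    )
    dm.setup()
    train_loader = dm.train_dataloader()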
@@ -107,44 +107,44 @@ class DataModule(L.LightningDataModule):
             raise ValueError(
                 "need 'organism_ontology_term_id' in the set of classes at least"
             )
+        if metacell_mode > 0 and get_knn_cells:
+            raise ValueError(
+                "cannot use metacell mode and get_knn_cells at the same time"
+            )
         mdataset = Dataset(
-            ln.Collection.filter(
+            ln.Collection.filter(key=collection_name, is_latest=True).first(),
             clss_to_predict=clss_to_predict,
             hierarchical_clss=hierarchical_clss,
             metacell_mode=metacell_mode,
             get_knn_cells=get_knn_cells,
             store_location=store_location,
             force_recompute_indices=force_recompute_indices,
+            genedf=genedf,
         )
         # and location
         self.metacell_mode = bool(metacell_mode)
         self.gene_pos = None
         self.collection_name = collection_name
-        if
-
-
-            self.gene_pos = mdataset.genedf["pos"].astype(int).tolist()
-        if gene_embeddings != "":
-            mdataset.genedf = mdataset.genedf.join(
-                pd.read_parquet(gene_embeddings).loc[:, :2], how="inner"
-            )
-            if do_gene_pos:
-                self.gene_pos = mdataset.genedf["pos"].tolist()
+        if gene_subset is not None:
+            tokeep = set(mdataset.genedf.index.tolist())
+            gene_subset = [u for u in gene_subset if u in tokeep]
         self.classes = {k: len(v) for k, v in mdataset.class_topred.items()}
         # we might want not to order the genes by expression (or do it?)
         # we might want to not introduce zeros and
         if use_default_col:
             kwargs["collate_fn"] = Collator(
-                organisms=mdataset.organisms,
+                organisms=mdataset.organisms if organisms is None else organisms,
                 how=how,
-                valid_genes=
+                valid_genes=gene_subset,
                 max_len=max_len,
-
-                org_to_id=mdataset.encoder[organism_name],
+                org_to_id=mdataset.encoder[organism_col],
                 tp_name=tp_name,
-                organism_name=
+                organism_name=organism_col,
                 class_names=list(self.classes.keys()),
+                genedf=genedf,
+                n_bins=n_bins,
             )
+        self.n_bins = n_bins
         self.validation_split = validation_split
         self.test_split = test_split
         self.dataset = mdataset
@@ -163,8 +163,13 @@ class DataModule(L.LightningDataModule):
         self.sampler_chunk_size = sampler_chunk_size
         self.store_location = store_location
         self.nnz = None
+        self.start_at = start_at
+        self.idx_full = None
+        self.max_len = max_len
         self.test_datasets = []
         self.force_recompute_indices = force_recompute_indices
+        self.curiculum = curiculum
+        self.valid_idx = []
         self.test_idx = []
         super().__init__()
         print("finished init")
@@ -183,9 +188,11 @@ class DataModule(L.LightningDataModule):
             f"\ttest datasets={str(self.test_datasets)},\n"
             f"perc test: {str(len(self.test_idx) / self.n_samples)},\n"
             f"\tclss_to_weight={self.clss_to_weight}\n"
-            + (
-
-
+            + (
+                "\twith train_dataset size of=(" + str(len(self.idx_full)) + ")\n)"
+                if self.idx_full is not None
+                else ")"
+            )
         )
 
     @property
@@ -229,12 +236,17 @@ class DataModule(L.LightningDataModule):
         """
         return self.dataset.genedf.index.tolist()
 
-    @
-    def
-
-
+    @property
+    def genes_dict(self):
+        return {
+            i: self.dataset.genedf.index[self.dataset.genedf.organism == i].tolist()
+            for i in self.dataset.organisms
+        }
+
+    def set_valid_genes_collator(self, genes):
         self.kwargs["collate_fn"]._setup(
-            genedf
+            # cannot use genedf there since I am purposefully decreasing it...
+            # genedf=self.dataset.genedf,
             org_to_id=self.kwargs["collate_fn"].org_to_id,
             valid_genes=genes,
         )
@@ -280,14 +292,11 @@ class DataModule(L.LightningDataModule):
             stage (str, optional): The stage of the model training process.
                 It can be either 'fit' or 'test'. Defaults to None.
         """
-        SCALE = 10
         print("setting up the datamodule")
         start_time = time.time()
         if (
             self.store_location is None
-            or not os.path.exists(
-                os.path.join(self.store_location, "train_weights.npy")
-            )
+            or not os.path.exists(os.path.join(self.store_location, "train_labels.npy"))
             or self.force_recompute_indices
         ):
             if "nnz" in self.clss_to_weight and self.weight_scaler > 0:
@@ -295,18 +304,19 @@ class DataModule(L.LightningDataModule):
                     "nnz", is_cat=False
                 )
                 self.clss_to_weight.remove("nnz")
-
-
-
-
+                # Sigmoid scaling with 2 parameters
+                midpoint = 2000
+                steepness = 0.003
+                # Apply sigmoid transformation
+                # sigmoid(x) = 1 / (1 + exp(-steepness * (x - midpoint)))
+                # Then scale to [1, NNZ_SCALE] range
+                sigmoid_values = 1 / (1 + np.exp(-steepness * (self.nnz - midpoint)))
+                self.nnz = 1 + ((NNZ_SCALE - 1) * sigmoid_values)
             if len(self.clss_to_weight) > 0 and self.weight_scaler > 0:
-
+                labels = self.dataset.get_label_cats(
                     self.clss_to_weight,
-                    scaler=self.weight_scaler,
-                    return_categories=True,
                 )
             else:
-                weights = np.ones(1)
                 labels = np.zeros(self.n_samples, dtype=int)
             if isinstance(self.validation_split, int):
                 len_valid = self.validation_split
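The hunk above replaces the old flat handling of `nnz` with a sigmoid squashing of each cell's non-zero gene count into the [1, NNZ_SCALE] range, centred at 2000 genes. A standalone sketch of that arithmetic (the example counts are invented):

    import numpy as np

    NNZ_SCALE = 1000
    midpoint, steepness = 2000, 0.003

    nnz = np.array([200, 2000, 8000])  # hypothetical per-cell non-zero gene counts
    sigmoid = 1 / (1 + np.exp(-steepness * (nnz - midpoint)))
    element_weights = 1 + (NNZ_SCALE - 1) * sigmoid
    print(element_weights.round(1))    # -> roughly [5.5, 500.5, 1000.0]

Shallow cells thus get element weights near 1 and deep cells saturate near NNZ_SCALE before being handed to the sampler as `element_weights`.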
@@ -316,9 +326,9 @@ class DataModule(L.LightningDataModule):
                 len_test = self.test_split
             else:
                 len_test = int(self.n_samples * self.test_split)
-            assert
-
-            )
+            assert (
+                len_test + len_valid < self.n_samples
+            ), "test set + valid set size is configured to be larger than entire dataset."
 
             idx_full = []
             if len(self.assays_to_drop) > 0:
@@ -363,28 +373,22 @@ class DataModule(L.LightningDataModule):
                 idx_full = idx_full[len_valid:]
             else:
                 self.valid_idx = None
-
-            labels[~np.isin(np.arange(self.n_samples), idx_full)] = len(weights) - 1
+            labels[~np.isin(np.arange(self.n_samples), idx_full)] = labels.max() + 1
             # some labels will now not exist anymore as replaced by len(weights) - 1.
             # this means that the associated weights should be 0.
             # by doing np.bincount(labels)*weights this will be taken into account
-            self.train_weights = weights
             self.train_labels = labels
             self.idx_full = idx_full
             if self.store_location is not None:
                 if (
                     not os.path.exists(
-                        os.path.join(self.store_location, "
+                        os.path.join(self.store_location, "train_labels.npy")
                     )
                     or self.force_recompute_indices
                 ):
                     os.makedirs(self.store_location, exist_ok=True)
                     if self.nnz is not None:
                         np.save(os.path.join(self.store_location, "nnz.npy"), self.nnz)
-                    np.save(
-                        os.path.join(self.store_location, "train_weights.npy"),
-                        self.train_weights,
-                    )
                     np.save(
                         os.path.join(self.store_location, "train_labels.npy"),
                         self.train_labels,
@@ -411,9 +415,6 @@ class DataModule(L.LightningDataModule):
             if os.path.exists(os.path.join(self.store_location, "nnz.npy"))
             else None
         )
-        self.train_weights = np.load(
-            os.path.join(self.store_location, "train_weights.npy")
-        )
         self.train_labels = np.load(
             os.path.join(self.store_location, "train_labels.npy")
         )
@@ -446,8 +447,8 @@ class DataModule(L.LightningDataModule):
                 # Create the optimized parallel sampler
                 print(f"Using {self.sampler_workers} workers for class indexing")
                 train_sampler = LabelWeightedSampler(
-                    label_weights=self.train_weights,
                     labels=self.train_labels,
+                    weight_scaler=self.weight_scaler,
                     num_samples=int(self.n_samples_per_epoch),
                     element_weights=self.nnz,
                     replacement=self.replacement,
@@ -455,15 +456,18 @@ class DataModule(L.LightningDataModule):
                     chunk_size=self.sampler_chunk_size,
                     store_location=self.store_location,
                     force_recompute_indices=self.force_recompute_indices,
+                    curiculum=self.curiculum,
                 )
             except ValueError as e:
                 raise ValueError(str(e) + " Have you run `datamodule.setup()`?")
+            dataset = None
         else:
-
+            dataset = Subset(self.dataset, self.idx_full)
+            train_sampler = RankShardSampler(len(dataset), start_at=self.start_at)
         current_loader_kwargs = kwargs.copy()
         current_loader_kwargs.update(self.kwargs)
         return DataLoader(
-            self.dataset,
+            self.dataset if dataset is None else dataset,
            sampler=train_sampler,
             **current_loader_kwargs,
         )
@@ -471,12 +475,11 @@ class DataModule(L.LightningDataModule):
     def val_dataloader(self):
         return (
             DataLoader(
-                self.dataset,
-                sampler=SubsetRandomSampler(self.valid_idx),
+                Subset(self.dataset, self.valid_idx),
                 **self.kwargs,
             )
             if self.valid_idx is not None
-            else
+            else []
         )
 
     def test_dataloader(self):
@@ -485,20 +488,21 @@ class DataModule(L.LightningDataModule):
                 self.dataset, sampler=SequentialSampler(self.test_idx), **self.kwargs
             )
             if self.test_idx is not None
-            else
+            else []
         )
 
     def predict_dataloader(self):
+        subset = Subset(self.dataset, self.idx_full)
         return DataLoader(
             self.dataset,
-            sampler=
+            sampler=RankShardSampler(len(subset), start_at=self.start_at),
             **self.kwargs,
         )
 
 
 class LabelWeightedSampler(Sampler[int]):
     """
-    A weighted random sampler that samples from a dataset with respect
+    A weighted random sampler that samples from a dataset with respect to both class weights and element weights.
 
     This sampler is designed to handle very large datasets efficiently, with optimizations for:
     1. Parallel building of class indices
@@ -515,21 +519,22 @@ class LabelWeightedSampler(Sampler[int]):
 
     def __init__(
         self,
-        label_weights: Sequence[float],
         labels: np.ndarray,
         num_samples: int,
         replacement: bool = True,
+        weight_scaler: Optional[float] = None,
         element_weights: Optional[Sequence[float]] = None,
         n_workers: int = None,
        chunk_size: int = None,  # Process 10M elements per chunk
         store_location: str = None,
         force_recompute_indices: bool = False,
+        curiculum: int = 0,
     ) -> None:
         """
         Initialize the sampler with parallel processing for large datasets.
 
         Args:
-
+            weight_scaler: Scaling factor for class weights (higher means less balanced sampling)
             labels: Class label for each dataset element (length = dataset size)
             num_samples: Number of samples to draw
             replacement: Whether to sample with replacement
@@ -539,10 +544,14 @@ class LabelWeightedSampler(Sampler[int]):
         """
         print("Initializing optimized parallel weighted sampler...")
         super(LabelWeightedSampler, self).__init__(None)
+        self.count = 0
+        self.curiculum = curiculum
 
         # Compute label weights (incorporating class frequencies)
         # Directly use labels as numpy array without conversion
-
+        counts = np.bincount(labels)
+        counts[-1] = 0  # Ensure the weight for the 'NONE' class is zero
+        label_weights = (weight_scaler * counts) / (counts + weight_scaler)
         self.label_weights = torch.as_tensor(
             label_weights, dtype=torch.float32
         ).share_memory_()
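With `label_weights` no longer passed in, the sampler now derives per-class weights from the label counts and `weight_scaler` as s*c / (c + s): the weight grows roughly linearly with class size for classes smaller than the scaler and saturates at the scaler for large ones, so abundant classes end up sampled near-uniformly while tiny classes stay roughly proportional to their size. A quick numeric illustration (class sizes invented):

    import numpy as np

    weight_scaler = 10
    counts = np.array([5, 100, 10_000, 1_000_000])          # cells per class
    weights = (weight_scaler * counts) / (counts + weight_scaler)
    print(weights.round(2))                                  # -> [3.33, 9.09, 9.99, 10.0]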
@@ -643,11 +652,16 @@ class LabelWeightedSampler(Sampler[int]):
         print(f"Done initializing sampler with {len(self.klass_offsets)} classes")
 
     def __iter__(self):
+        self.count += 1
         # Sample classes according to their weights
         print("sampling a new batch of size", self.num_samples)
 
         sample_labels = torch.multinomial(
-
+            (
+                self.label_weights ** min(1, ((self.count + 5) / self.curiculum))
+                if self.curiculum
+                else self.label_weights
+            ),
             num_samples=self.num_samples,
             replacement=True,
         )
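The `curiculum` knob anneals this class distribution over epochs: `self.count` increments on every `__iter__`, and raising the weights to `min(1, (count + 5) / curiculum)` starts close to uniform class sampling and converges to the full weighting once `count + 5 >= curiculum`. A small check of the exponent's effect, reusing the weights from the sketch above:

    import numpy as np

    curiculum = 50
    weights = np.array([3.33, 9.09, 10.0])
    for count in (1, 10, 45):
        exponent = min(1, (count + 5) / curiculum)
        print(count, exponent, (weights ** exponent).round(2))
    # 1   0.12  [1.16 1.3  1.32]   nearly flat early on
    # 10  0.3   [1.43 1.94 2.  ]
    # 45  1     [3.33 9.09 10. ]   full class weighting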
@@ -655,7 +669,9 @@ class LabelWeightedSampler(Sampler[int]):
         unique_samples, sample_counts = torch.unique(sample_labels, return_counts=True)
 
         # Initialize result tensor
-        result_indices_list =
+        result_indices_list = (
+            []
+        )  # Changed name to avoid conflict if you had result_indices elsewhere
 
         # Process only the classes that were actually sampled
         for i, (label, count) in tqdm(
@@ -675,6 +691,11 @@ class LabelWeightedSampler(Sampler[int]):
             # This is a critical point for memory
             current_element_weights_slice = self.element_weights[klass_index]
 
+            if current_element_weights_slice.shape[0] >= (2**24) - 1:
+                ind = torch.randperm(len(klass_index))[: (2**24) - 10]
+                klass_index = klass_index[ind]
+                current_element_weights_slice = current_element_weights_slice[ind]
+
             if self.replacement:
                 right_inds = torch.multinomial(
                     current_element_weights_slice,
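This new guard works around `torch.multinomial`'s hard limit on the number of categories it accepts (its error message reads "number of categories cannot exceed 2^24"): classes at or above the cap are randomly subsampled to just under it before weights are drawn. A hedged sketch of the same guard as a standalone helper (the function name is mine, not the package's):

    import torch

    MAX_CATS = 2**24  # torch.multinomial's category limit

    def sample_within_class(klass_index: torch.Tensor, weights: torch.Tensor, n: int) -> torch.Tensor:
        # Draw n dataset indices from one class, capping the multinomial input size.
        if weights.shape[0] >= MAX_CATS - 1:
            keep = torch.randperm(len(klass_index))[: MAX_CATS - 10]
            klass_index, weights = klass_index[keep], weights[keep]
        return klass_index[torch.multinomial(weights, n, replacement=True)]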
@@ -827,3 +848,35 @@ class LabelWeightedSampler(Sampler[int]):
             chunk_indices[int(label)] = indices[label_mask]
 
         return chunk_indices
+
+
+class RankShardSampler(Sampler[int]):
+    """Shards a dataset contiguously across ranks without padding or duplicates.
+    Preserves the existing order (e.g., your pre-shuffled idx_full)."""
+
+    def __init__(self, data_len: int, start_at: int = 0) -> None:
+        self.data_len = data_len
+        self.start_at = start_at
+        if torch.distributed.is_available() and torch.distributed.is_initialized():
+            self.rank = torch.distributed.get_rank()
+            self.world_size = torch.distributed.get_world_size()
+        else:
+            self.rank, self.world_size = 0, 1
+
+        # contiguous chunk per rank (last rank may be shorter)
+        if self.start_at > 0:
+            print(
+                "!!!!ATTTENTION: make sure that you are running on the exact same \
+                number of GPU as your previous run!!!!!"
+            )
+        print(f"Sharding data of size {data_len} over {self.world_size} ranks")
+        per_rank = math.ceil((self.data_len - self.start_at) / self.world_size)
+        self.start = int((self.start_at / self.world_size) + (self.rank * per_rank))
+        self.end = min(self.start + per_rank, self.data_len)
+        print(f"Rank {self.rank} processing indices from {self.start} to {self.end}")
+
+    def __iter__(self):
+        return iter(range(self.start, self.end))
+
+    def __len__(self):
+        return self.end - self.start
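The new RankShardSampler gives every rank one contiguous, non-overlapping slice of the (already shuffled) index list, with no padding or duplication, and `start_at` skips an initial offset (hence the warning about re-running on the same number of GPUs). A tiny standalone illustration of the same index math (the helper function is mine):

    import math

    def shard_bounds(data_len: int, world_size: int, rank: int, start_at: int = 0) -> range:
        # Contiguous per-rank index range, mirroring RankShardSampler.
        per_rank = math.ceil((data_len - start_at) / world_size)
        start = int(start_at / world_size + rank * per_rank)
        end = min(start + per_rank, data_len)
        return range(start, end)

    print([list(shard_bounds(10, 3, r)) for r in range(3)])
    # -> [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]  (last rank is simply shorter)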
scdataloader/mapped.py
CHANGED
@@ -10,7 +10,7 @@ from __future__ import annotations
 import os
 from collections import Counter
 from functools import reduce
-from typing import TYPE_CHECKING, Literal
+from typing import TYPE_CHECKING, List, Literal
 
 import numpy as np
 import pandas as pd
@@ -117,20 +117,20 @@ class MappedCollection:
 
     def __init__(
         self,
-        path_list:
-        layers_keys: str |
-        obs_keys: str |
-        obsm_keys: str |
+        path_list: List[UPathStr],
+        layers_keys: str | List[str] | None = None,
+        obs_keys: str | List[str] | None = None,
+        obsm_keys: str | List[str] | None = None,
         obs_filter: dict[str, str | tuple[str, ...]] | None = None,
         join: Literal["inner", "outer"] | None = "inner",
-        encode_labels: bool |
+        encode_labels: bool | List[str] = True,
         unknown_label: str | dict[str, str] | None = None,
         cache_categories: bool = True,
         parallel: bool = False,
         dtype: str | None = None,
         metacell_mode: float = 0.0,
         get_knn_cells: bool = False,
-        meta_assays:
+        meta_assays: List[str] = ["EFO:0022857", "EFO:0010961"],
         store_location: str | None = None,
         force_recompute_indices: bool = False,
     ):
@@ -200,7 +200,9 @@ class MappedCollection:
                 self._cache_categories(self.obs_keys)
                 torch.save(self._cache_cats, self.store_location)
             else:
-                self._cache_cats = torch.load(
+                self._cache_cats = torch.load(
+                    self.store_location, weights_only=False
+                )
                 print(f"Loaded categories from {self.store_location}")
         self.encoders: dict = {}
         if self.encode_labels:
@@ -348,7 +350,7 @@ class MappedCollection:
         vrs_sort_status = (vrs.is_monotonic_decreasing for vrs in self.var_list)
         return all(vrs_sort_status)
 
-    def check_vars_non_aligned(self, vars: pd.Index |
+    def check_vars_non_aligned(self, vars: pd.Index | List) -> List[int]:
         """Returns indices of objects with non-aligned variables.
 
         Args:
@@ -380,7 +382,7 @@ class MappedCollection:
         return (self.n_obs, self.n_vars)
 
     @property
-    def original_shapes(self) ->
+    def original_shapes(self) -> List[tuple[int, int]]:
         """Shapes of the underlying AnnData objects (with `obs_filter` applied)."""
         if self.n_vars_list is None:
             n_vars_list = [None] * len(self.n_obs_list)
@@ -437,7 +439,20 @@ class MappedCollection:
             print(out)
             raise
 
-        if self.
+        if self.get_knn_cells:
+            distances = self._get_data_idx(store["obsp"]["distances"], obs_idx)
+            nn_idx = np.argsort(-1 / (distances - 1e-6))[:6]
+            out["knn_cells"] = np.array(
+                [
+                    self._get_data_idx(
+                        lazy_data, i, self.join_vars, var_idxs_join, self.n_vars
+                    )
+                    for i in nn_idx
+                ],
+                dtype=int,
+            )
+            out["knn_cells_info"] = distances[nn_idx]
+        elif self.metacell_mode > 0:
             if (
                 len(self.meta_assays) > 0
                 and "assay_ontology_term_id" in self.obs_keys
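In the `get_knn_cells` branch (which now takes precedence over metacell mode), a row of `obsp["distances"]` is mostly zeros, with non-zero entries only for the stored neighbors; the `-1 / (d - 1e-6)` transform maps those zeros to a large positive value so they sort last, and the nearest stored neighbors come first in the argsort. A small numeric check of that trick:

    import numpy as np

    # one row of obsp["distances"]; zeros mean "not a stored neighbor"
    distances = np.array([0.0, 0.8, 0.0, 0.3, 1.2, 0.0, 0.5])
    order = np.argsort(-1 / (distances - 1e-6))
    print(order[:3])   # -> [3 6 1], the three closest stored neighbors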
@@ -454,19 +469,6 @@ class MappedCollection:
                     out[layers_key] += self._get_data_idx(
                         lazy_data, i, self.join_vars, var_idxs_join, self.n_vars
                     )
-        elif self.get_knn_cells:
-            distances = self._get_data_idx(store["obsp"]["distances"], obs_idx)
-            nn_idx = np.argsort(-1 / (distances - 1e-6))[:6]
-            out["knn_cells"] = np.array(
-                [
-                    self._get_data_idx(
-                        lazy_data, i, self.join_vars, var_idxs_join, self.n_vars
-                    )
-                    for i in nn_idx
-                ],
-                dtype=int,
-            )
-            out["distances"] = distances[nn_idx]
 
         return out
 
@@ -541,7 +543,7 @@ class MappedCollection:
 
     def get_label_weights(
         self,
-        obs_keys: str |
+        obs_keys: str | List[str],
         scaler: float | None = None,
         return_categories: bool = False,
     ):