anndata 0.12.3__tar.gz → 0.12.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {anndata-0.12.3 → anndata-0.12.4}/.github/workflows/test-cpu.yml +2 -2
- {anndata-0.12.3 → anndata-0.12.4}/.github/workflows/test-gpu.yml +1 -1
- {anndata-0.12.3 → anndata-0.12.4}/PKG-INFO +1 -1
- anndata-0.12.4/benchmarks/benchmarks/backed_hdf5.py +112 -0
- {anndata-0.12.3 → anndata-0.12.4}/benchmarks/benchmarks/dataset2d.py +21 -19
- {anndata-0.12.3 → anndata-0.12.4}/benchmarks/benchmarks/readwrite.py +12 -48
- {anndata-0.12.3 → anndata-0.12.4}/benchmarks/benchmarks/sparse_dataset.py +22 -15
- {anndata-0.12.3 → anndata-0.12.4}/benchmarks/benchmarks/utils.py +21 -3
- anndata-0.12.4/docs/release-notes/0.12.4.md +4 -0
- anndata-0.12.4/docs/release-notes/2172.bug.md +1 -0
- {anndata-0.12.3 → anndata-0.12.4}/hatch.toml +7 -3
- {anndata-0.12.3 → anndata-0.12.4}/pyproject.toml +5 -1
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/aligned_df.py +7 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/index.py +136 -23
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/merge.py +6 -5
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/sparse_dataset.py +4 -3
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_io/specs/methods.py +16 -25
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/experimental/backed/_lazy_arrays.py +5 -2
- {anndata-0.12.3 → anndata-0.12.4}/tests/conftest.py +7 -4
- {anndata-0.12.3 → anndata-0.12.4}/tests/lazy/test_concat.py +1 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_annot.py +24 -1
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_backed_hdf5.py +102 -9
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_dask.py +14 -2
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_inplace_subset.py +1 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_io_elementwise.py +4 -1
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_readwrite.py +6 -9
- {anndata-0.12.3 → anndata-0.12.4}/.cirun.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.codecov.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.editorconfig +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/ISSUE_TEMPLATE/bug-report.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/ISSUE_TEMPLATE/enhancement-request.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/ISSUE_TEMPLATE/question.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/dependabot.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/workflows/benchmark.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/workflows/check-pr.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/workflows/close-stale.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/workflows/codespell.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/workflows/label-stale.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/workflows/publish.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.gitignore +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.gitmodules +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.pre-commit-config.yaml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.prettierignore +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.prettierrc.yaml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.readthedocs.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.taplo.toml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.vscode/launch.json +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.vscode/settings.json +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/LICENSE +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/README.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/benchmarks/README.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/benchmarks/asv.conf.json +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/benchmarks/benchmarks/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/benchmarks/benchmarks/anndata.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/biome.jsonc +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/ci/constraints.txt +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/ci/scripts/min-deps.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/ci/scripts/towncrier_automation.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/Makefile +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/_key_contributors.rst +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/_static/img/anndata_schema.svg +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/_templates/autosummary/class.rst +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/api.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/benchmark-read-write.ipynb +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/benchmarks.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/concatenation.rst +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/conf.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/contributing.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/extensions/autosummary_skip_inherited.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/extensions/no_skip_abc_members.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/extensions/patch_myst_cite.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/fileformat-prose.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/index.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/interoperability.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/news.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/references.rst +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.10.0.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.10.1.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.10.2.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.10.3.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.10.4.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.10.5.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.10.6.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.10.7.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.10.8.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.10.9.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.11.0.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.11.1.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.11.2.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.11.3.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.11.4.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.12.0.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.12.1.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.12.2.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.12.3.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.4.0.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.5.0.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.6.0.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.6.x.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.7.0.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.7.2.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.7.3.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.7.4.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.7.5.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.7.6.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.7.7.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.7.8.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.8.0.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.9.0.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.9.1.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.9.2.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/index.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/tutorials/index.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/tutorials/zarr-v3.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/access.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/aligned_mapping.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/anndata.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/extensions.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/file_backing.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/raw.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/storage.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/views.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/xarray.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_io/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_io/h5ad.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_io/read.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_io/specs/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_io/specs/lazy_methods.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_io/specs/registry.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_io/utils.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_io/write.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_io/zarr.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_settings.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_settings.pyi +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_types.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_warnings.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/abc.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/compat/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/experimental/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/experimental/_dispatch_io.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/experimental/backed/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/experimental/backed/_compat.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/experimental/backed/_io.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/experimental/merge.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/experimental/multi_files/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/experimental/multi_files/_anncollection.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/experimental/pytorch/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/experimental/pytorch/_annloader.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/io.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/logging.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/tests/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/tests/helpers.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/types.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/typing.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/utils.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/testing/anndata/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/testing/anndata/_doctest.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/testing/anndata/_pytest.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/testing/anndata/py.typed +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/adata-comments.tsv +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/adata.csv +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/archives/readme.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/archives/v0.11.4/adata.h5ad +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/archives/v0.11.4/adata.zarr.zip +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/archives/v0.11.4/readme.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/archives/v0.7.0/adata.h5ad +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/archives/v0.7.0/adata.zarr.zip +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/archives/v0.7.8/adata.h5ad +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/archives/v0.7.8/adata.zarr.zip +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/excel.xlsx +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/umi_tools.tsv.gz +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/lazy/conftest.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/lazy/test_read.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/lazy/test_write.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_anncollection.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_awkward.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_backed_dense.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_backed_sparse.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_base.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_concatenate.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_concatenate_disk.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_dask_view_mem.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_deprecations.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_extensions.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_get_vector.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_gpu.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_helpers.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_io_backwards_compat.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_io_conversion.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_io_dispatched.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_io_partial.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_io_utils.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_io_warnings.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_layers.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_obsmvarm.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_obspvarp.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_raw.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_repr.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_settings.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_structured_arrays.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_transpose.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_uns.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_utils.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_views.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_x.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_xarray.py +0 -0

.github/workflows/test-cpu.yml

```diff
@@ -43,7 +43,7 @@ jobs:
     strategy:
       matrix:
         env: ${{ fromJSON(needs.get-environments.outputs.envs) }}
-        io_mark: ["zarr_io", "not zarr_io"]
+        io_mark: ["zarr_io", "not zarr_io", "dask_distributed"] # dask_distributed should not be run with -n auto as it uses a client with processes
     env: # environment variables for use in codecov’s env_vars tagging
       ENV_NAME: ${{ matrix.env.name }}
       IO_MARK: ${{ matrix.io_mark }}
@@ -72,7 +72,7 @@ jobs:
        env:
          COVERAGE_PROCESS_START: ${{ github.workspace }}/pyproject.toml
        run: |
-          hatch run ${{ matrix.env.name }}:run-cov -v --color=yes -n auto --junitxml=test-data/test-results.xml -m "${{ matrix.io_mark }}" ${{ matrix.env.args }}
+          hatch run ${{ matrix.env.name }}:run-cov -v --color=yes ${{ matrix.io_mark != 'dask_distributed' && '-n auto' || '' }} --junitxml=test-data/test-results.xml -m "${{ matrix.io_mark }}" ${{ matrix.env.args }}
          hatch run ${{ matrix.env.name }}:cov-combine
          hatch run ${{ matrix.env.name }}:coverage xml

```

.github/workflows/test-gpu.yml

```diff
@@ -63,7 +63,7 @@ jobs:
          echo "max_python_version=$max_version" >> $GITHUB_ENV

      - name: Install UV
-        uses: astral-sh/setup-uv@v6
+        uses: astral-sh/setup-uv@v6 # TODO: upgrade once cirun image supports node 24
        with:
          enable-cache: true
          python-version: ${{ env.max_python_version }}
```

benchmarks/benchmarks/backed_hdf5.py (new file)

```diff
@@ -0,0 +1,112 @@
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+from scipy import sparse
+
+import anndata as ad
+
+file_paths = {"sparse": "adata_sparse.h5ad"}
+
+
+class BackedHDF5Indexing:
+    param_names = ("arr_type",)
+    params = ("sparse",)
+
+    def setup_cache(self):
+        X_sparse = sparse.random(
+            10000,
+            50000,
+            density=0.01,
+            format="csr",
+            random_state=np.random.default_rng(42),
+        )
+        for X, arr_type in [
+            (X_sparse, "sparse"),
+        ]:
+            n_obs, n_var = X.shape
+
+            # Create obs and var dataframes
+            obs = pd.DataFrame(
+                {
+                    "cell_type": pd.Categorical(
+                        np.random.choice(["TypeA", "TypeB", "TypeC"], n_obs)
+                    ),
+                    "total_counts": np.random.randint(1000, 5000, n_obs),
+                },
+                index=[f"cell_{i}" for i in range(n_obs)],
+            )
+
+            var = pd.DataFrame(
+                {
+                    "gene_name": [f"gene_{i}" for i in range(n_var)],
+                },
+                index=[f"ENSG_{i:08d}" for i in range(n_var)],
+            )
+
+            # Create AnnData object and save to HDF5
+            adata = ad.AnnData(X=X, obs=obs, var=var)
+
+            # Create temporary file
+            adata.write_h5ad(file_paths[arr_type])
+
+    def setup(self, arr_type):
+        # Open as backed
+        self.adata_backed = ad.read_h5ad(file_paths[arr_type], backed="r")
+        self.n_obs, self.n_var = self.adata_backed.shape
+        # Prepare indices for duplicate index testing
+        self.obs_idx_with_dupes = np.array([0, 1, 0, 2, 1] * (self.n_obs // 100 + 1))[
+            : (self.n_obs // 10)
+        ]
+        self.var_idx_with_dupes = np.array([0, 1, 2, 0, 3] * (self.n_var // 100 + 1))[
+            : (self.n_var // 10)
+        ]
+        self.obs_idx_no_dupes = np.arange(0, self.n_obs, 10)
+        self.var_idx_no_dupes = np.arange(0, self.n_var, 10)
+
+    def time_slice_obs(self, *_):
+        """Time slicing observations from backed HDF5"""
+        self.adata_backed[0 : (self.n_obs // 2), :]
+
+    def time_slice_obs_to_memory(self, *_):
+        """Time slicing observations from backed HDF5"""
+        self.adata_backed[0 : (self.n_obs // 2), :].to_memory()
+
+    def peakmem_slice_obs(self, *_):
+        """Peak memory for slicing observations from backed HDF5"""
+        self.adata_backed[0 : (self.n_obs // 2), :]
+
+    def time_fancy_index_no_dupes(self, *_):
+        """Time fancy indexing without duplicates"""
+        self.adata_backed[self.obs_idx_no_dupes, self.var_idx_no_dupes]
+
+    def peakmem_fancy_index_no_dupes(self, *_):
+        """Peak memory for fancy indexing without duplicates"""
+        self.adata_backed[self.obs_idx_no_dupes, self.var_idx_no_dupes]
+
+    def time_fancy_index_no_dupes_to_memory(self, *_):
+        """Time fancy indexing without duplicates"""
+        self.adata_backed[self.obs_idx_no_dupes, self.var_idx_no_dupes].to_memory()
+
+    def time_index_with_dupes_obs(self, *_):
+        """Time fancy indexing with duplicate observation indices"""
+        self.adata_backed[self.obs_idx_with_dupes, :]
+
+    def peakmem_index_with_dupes_obs(self, *_):
+        """Peak memory for fancy indexing with duplicate observation indices"""
+        self.adata_backed[self.obs_idx_with_dupes, :]
+
+    def time_to_memory_subset(self, *_):
+        """Time converting subset to memory"""
+        subset = self.adata_backed[0 : (self.n_obs // 4), 0 : (self.n_var // 4)]
+        subset.to_memory()
+
+    def peakmem_to_memory_subset(self, *_):
+        """Peak memory for converting subset to memory"""
+        subset = self.adata_backed[0 : (self.n_obs // 4), 0 : (self.n_var // 4)]
+        subset.to_memory()
+
+    def teardown(self, *_):
+        """Clean up temporary files"""
+        if hasattr(self, "adata_backed"):
+            self.adata_backed.file.close()
```
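A note on the asv conventions relied on here: `setup_cache` runs once per environment and its output files are shared across parameter combinations, while `time_*`/`peakmem_*` methods are discovered automatically. The duplicate-index cases are the interesting ones, since raw HDF5 fancy indexing requires sorted, unique indices; a minimal sketch (reusing the file name from the benchmark and assuming it has already been created) of what they exercise:

```python
import numpy as np

import anndata as ad

# Assumes setup_cache above has written this file to the working directory.
adata = ad.read_h5ad("adata_sparse.h5ad", backed="r")

idx = np.array([0, 1, 0, 2, 1])  # unsorted *and* duplicated row positions
view = adata[idx, :]             # anndata reorders/deduplicates behind the scenes
print(view.to_memory().shape)    # (5, 50000)
```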

benchmarks/benchmarks/dataset2d.py

```diff
@@ -1,7 +1,5 @@
 from __future__ import annotations

-import tempfile
-from pathlib import Path
 from typing import TYPE_CHECKING

 import h5py
@@ -12,35 +10,39 @@ import zarr
 import anndata as ad

 if TYPE_CHECKING:
-    from
+    from typing import Literal


 class Dataset2D:
-    param_names = ("
+    param_names = ("store_type", "chunks")
     params = (
-        (
-            lambda: h5py.File(Path(tempfile.mkdtemp()) / "data.h5ad", mode="w"),
-            lambda: zarr.open(
-                Path(tempfile.mkdtemp()) / "data.zarr", mode="w", zarr_version=2
-            ),
-        ),
+        ("zarr", "h5ad"),
         ((-1,), None),
     )

-    def
-
-    ):
-        self.n_obs = 100000
+    def setup_cache(self):
+        n_obs = 100000
         df = pd.DataFrame(
             {
-                "a": pd.Categorical(np.array(["a"] *
-                "b": np.arange(
+                "a": pd.Categorical(np.array(["a"] * n_obs)),
+                "b": np.arange(n_obs),
             },
-            index=[f"cell{i}" for i in range(
+            index=[f"cell{i}" for i in range(n_obs)],
+        )
+        for store in [
+            h5py.File("data.h5ad", mode="w"),
+            zarr.open("data.zarr", mode="w", zarr_version=2),
+        ]:
+            ad.io.write_elem(store, "obs", df)
+
+    def setup(self, store_type: Literal["zarr", "h5ad"], chunks: None | tuple[int]):
+        store = (
+            h5py.File("data.h5ad", mode="r")
+            if store_type == "h5ad"
+            else zarr.open("data.zarr")
         )
-        store = gen_store()
-        ad.io.write_elem(store, "obs", df)
         self.ds = ad.experimental.read_elem_lazy(store["obs"], chunks=chunks)
+        self.n_obs = self.ds.shape[0]

     def time_getitem_slice(self, *_):
         self.ds.iloc[0 : (self.n_obs // 2)].to_memory()
```
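The refactor replaces per-instance store factories (lambdas over `tempfile`) with `setup_cache`, which asv runs once in a dedicated benchmark directory that is also the working directory for `setup`, so relative paths like `data.zarr` resolve to the cached copies. A hedged sketch of the pattern (class and file names hypothetical, assuming asv's documented `setup_cache` semantics):

```python
class Example:
    """Sketch of the asv setup_cache pattern used above."""

    params = (("small", "large"),)
    param_names = ("size",)

    def setup_cache(self):
        # Runs once per benchmark environment; files land in a cached
        # working directory that every setup()/time_*() call also sees.
        for size, n in [("small", 10), ("large", 1000)]:
            with open(f"data_{size}.txt", "w") as f:
                f.write("x" * n)

    def setup(self, size):
        # Re-open by relative path instead of regenerating per parameter set.
        with open(f"data_{size}.txt") as f:
            self.data = f.read()

    def time_len(self, size):
        len(self.data)
```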

benchmarks/benchmarks/readwrite.py

```diff
@@ -38,52 +38,15 @@ from .utils import get_actualsize, get_peak_mem, sedate

 PBMC_3K_URL = "https://falexwolf.de/data/pbmc3k_raw.h5ad"

-# PBMC_3K_PATH = Path(__file__).parent / "data/pbmc3k_raw.h5ad"
-# PBMC_REDUCED_PATH = Path(__file__).parent / "10x_pbmc68k_reduced.h5ad"
-# BM_43K_CSR_PATH = Path(__file__).parent.parent / "datasets/BM2_43k-cells.h5ad"
-# BM_43K_CSC_PATH = Path(__file__).parent.parent / "datasets/BM2_43k-cells_CSC.h5ad"
-
-
-# class ZarrReadSuite:
-#     params = []
-#     param_names = ["input_url"]
-
-#     def setup(self, input_url):
-#         self.filepath = pooch.retrieve(url=input_url, known_hash=None)
-
-#     def time_read_full(self, input_url):
-#         anndata.read_zarr(self.filepath)
-
-#     def peakmem_read_full(self, input_url):
-#         anndata.read_zarr(self.filepath)
-
-#     def mem_readfull_object(self, input_url):
-#         return anndata.read_zarr(self.filepath)
-
-#     def track_read_full_memratio(self, input_url):
-#         mem_recording = memory_usage(
-#             (sedate(anndata.read_zarr, 0.005), (self.filepath,)), interval=0.001
-#         )
-#         adata = anndata.read_zarr(self.filepath)
-#         base_size = mem_recording[-1] - mem_recording[0]
-#         print(np.max(mem_recording) - np.min(mem_recording))
-#         print(base_size)
-#         return (np.max(mem_recording) - np.min(mem_recording)) / base_size
-
-#     def peakmem_read_backed(self, input_url):
-#         anndata.read_zarr(self.filepath, backed="r")
-
-#     def mem_read_backed_object(self, input_url):
-#         return anndata.read_zarr(self.filepath, backed="r")
-

 class H5ADInMemorySizeSuite:
-
-    params = _urls.keys()
-    param_names = ("input_data",)
+    filepath = "pbmc_in_mem.h5ad"

-    def
-
+    def setup_cache(self):
+        # Need to specify path because the working directory is special for asv
+        pooch.retrieve(
+            url=PBMC_3K_URL, known_hash=None, path=Path.cwd(), fname=self.filepath
+        )

     def track_in_memory_size(self, *_):
         adata = anndata.read_h5ad(self.filepath)
@@ -99,12 +62,13 @@ class H5ADInMemorySizeSuite:


 class H5ADReadSuite:
-
-    params = _urls.keys()
-    param_names = ("input_data",)
+    filepath = "pbmc_read.h5ad"

-    def
-
+    def setup_cache(self):
+        # Need to specify path because the working directory is special for asv
+        pooch.retrieve(
+            url=PBMC_3K_URL, known_hash=None, path=Path.cwd(), fname=self.filepath
+        )

     def time_read_full(self, *_):
         anndata.read_h5ad(self.filepath)
```
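`pooch.retrieve` normally caches downloads under a per-user OS cache directory; passing `path` and `fname` pins the file into asv's benchmark working directory instead, which is what the "working directory is special for asv" comment refers to. A small illustration:

```python
from pathlib import Path

import pooch

# Download (or reuse) the PBMC file next to the benchmark's working directory.
# known_hash=None skips checksum verification, matching the diff above.
local_path = pooch.retrieve(
    url="https://falexwolf.de/data/pbmc3k_raw.h5ad",
    known_hash=None,
    path=Path.cwd(),          # override pooch's default cache location
    fname="pbmc_read.h5ad",   # fixed name so setup() can re-open it by path
)
print(local_path)
```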

benchmarks/benchmarks/sparse_dataset.py

```diff
@@ -21,7 +21,7 @@ def make_alternating_mask(n):


 class SparseCSRContiguousSlice:
-    _slices = MappingProxyType({
+    _indexers = MappingProxyType({
         "0:1000": slice(0, 1000),
         "0:9000": slice(0, 9000),
         ":9000:-1": slice(None, 9000, -1),
@@ -31,42 +31,49 @@ class SparseCSRContiguousSlice:
         "first": 0,
         "alternating": make_alternating_mask(10),
     })
+    filepath = "data.zarr"
     params = (
-        …
-        (10_000, 10_000),
-        # (10_000, 500)
-        ],
-        _slices.keys(),
+        list(_indexers.keys()),
         [True, False],
     )
-    param_names = (
+    param_names = (
+        "index",
+        "use_dask",
+    )

-    def
+    def setup_cache(self):
         X = sparse.random(
-            …
+            10_000,
+            10_000,
+            density=0.01,
+            format="csr",
+            random_state=np.random.default_rng(42),
         )
-        …
-        g = zarr.group()
+        g = zarr.group(self.filepath)
         write_elem(g, "X", X)
+
+    def setup(self, index: str, use_dask: bool):  # noqa: FBT001
+        g = zarr.open(self.filepath)
         self.x = read_elem_lazy(g["X"]) if use_dask else sparse_dataset(g["X"])
         self.adata = AnnData(self.x)
+        self.index = self._indexers[index]

     def time_getitem(self, *_):
-        res = self.x[self.
+        res = self.x[self.index]
         if isinstance(res, DaskArray):
             res.compute()

     def peakmem_getitem(self, *_):
-        res = self.x[self.
+        res = self.x[self.index]
         if isinstance(res, DaskArray):
             res.compute()

     def time_getitem_adata(self, *_):
-        res = self.adata[self.
+        res = self.adata[self.index]
         if isinstance(res, DaskArray):
             res.compute()

     def peakmem_getitem_adata(self, *_):
-        res = self.adata[self.
+        res = self.adata[self.index]
         if isinstance(res, DaskArray):
             res.compute()
```
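`MappingProxyType` gives a read-only view of the indexer table, so the class attribute cannot be mutated by accident while asv enumerates `params`; `list(_indexers.keys())` materializes the keys because asv expects a concrete sequence. A quick illustration:

```python
from types import MappingProxyType

_indexers = MappingProxyType({"0:1000": slice(0, 1000), "first": 0})

print(list(_indexers.keys()))  # ['0:1000', 'first'] -- a concrete list for asv params
try:
    _indexers["new"] = slice(None)
except TypeError as e:
    print(e)  # 'mappingproxy' object does not support item assignment
```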

benchmarks/benchmarks/utils.py

```diff
@@ -95,13 +95,31 @@ def gen_indexer(adata, dim, index_kind, ratio):

 def gen_adata(n_obs, n_var, attr_set):
     if "X-csr" in attr_set:
-        X = sparse.random(
+        X = sparse.random(
+            n_obs,
+            n_var,
+            density=0.1,
+            format="csr",
+            random_state=np.random.default_rng(42),
+        )
     elif "X-dense" in attr_set:
-        X = sparse.random(
+        X = sparse.random(
+            n_obs,
+            n_var,
+            density=0.1,
+            format="csr",
+            random_state=np.random.default_rng(42),
+        )
         X = X.toarray()
     else:
         # TODO: There's probably a better way to do this
-        X = sparse.random(
+        X = sparse.random(
+            n_obs,
+            n_var,
+            density=0,
+            format="csr",
+            random_state=np.random.default_rng(42),
+        )
     adata = AnnData(X)
     if "obs,var" in attr_set:
         adata.obs = pd.DataFrame(
```
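Passing a seeded `Generator` as `random_state` makes `sparse.random` deterministic, so every benchmark run (and machine) sees the same matrix. A quick check:

```python
import numpy as np
from scipy import sparse

X = sparse.random(100, 50, density=0.1, format="csr", random_state=np.random.default_rng(42))
Y = sparse.random(100, 50, density=0.1, format="csr", random_state=np.random.default_rng(42))

assert (X - Y).nnz == 0  # identical sparsity pattern and values
```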

docs/release-notes/2172.bug.md (new file)

```diff
@@ -0,0 +1 @@
+{func}`dask.array.store` was producing corrupted data with zarr v3 + distributed scheduler + a lock (which we used internally): see {ref}`dask/dask#12109`. Thus dense arrays were potentially being stored with corrupted data. The solution is to remove the lock for newer versions of dask but without the lock in older versions, it is impossible to store the data. Thus versions of dask older than `2025.4.0` will not be supported for writing dense data. {user}`ilan-gold`
```
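For orientation, a hedged sketch of the lock-free write path the fix moves to (file names hypothetical; this is not the exact anndata internals):

```python
import dask.array as da
import zarr

# Write a dense dask array into a zarr v3 store.
x = da.ones((1_000, 100), chunks=(100, 100))
g = zarr.open_group("example.zarr", mode="w")
z = g.create_array("X", shape=x.shape, chunks=(100, 100), dtype=x.dtype)

# With dask >= 2025.4.0 the store call can run without a lock, which avoids
# the corruption seen with zarr v3 + the distributed scheduler + a lock.
da.store(x, z, lock=False)
```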

hatch.toml

```diff
@@ -24,9 +24,13 @@ overrides.matrix.deps.env-vars = [
   { if = [ "min" ], key = "UV_CONSTRAINT", value = "ci/constraints.txt ci/min-deps.txt" },
 ]
 overrides.matrix.deps.pre-install-commands = [
-  { if = [
-  …
-  …
+  { if = [
+    "min",
+  ], value = "uv run ci/scripts/min-deps.py pyproject.toml --all-extras -o ci/min-deps.txt" },
+  # To prevent situations like https://github.com/pydata/xarray/issues/10419 going forward, and test against zarr as well
+  { if = [
+    "pre",
+  ], value = "echo 'xarray @ git+https://github.com/pydata/xarray.git\nzarr @ git+https://github.com/zarr-developers/zarr-python.git' > ci/pre-deps.txt" },

 ]
 overrides.matrix.deps.python = [
```

pyproject.toml

```diff
@@ -174,7 +174,11 @@ testpaths = [
 ]
 # For some reason this effects how logging is shown when tests are run
 xfail_strict = true
-markers = [
+markers = [
+    "gpu: mark test to run on GPU",
+    "zarr_io: mark tests that involve zarr io",
+    "dask_distributed: tests that need a distributed client with multiple processes",
+]

 [tool.ruff]
 src = [ "src" ]
```
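Registering markers here keeps pytest from warning about (or, under `--strict-markers`, rejecting) the new `dask_distributed` mark. A hedged sketch of how such a marked test might look (test name and body are illustrative, not from the diff):

```python
import pytest


@pytest.mark.dask_distributed
def test_write_with_distributed_client():
    distributed = pytest.importorskip("distributed")
    # A multi-process client is why these tests are excluded from -n auto in CI.
    with distributed.Client(processes=True):
        ...  # exercise zarr/dense writing under the distributed scheduler
```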

src/anndata/_core/aligned_df.py

```diff
@@ -78,6 +78,13 @@ def _gen_dataframe_df(
     attr: Literal["obs", "var"],
     length: int | None = None,
 ):
+    if isinstance(anno.index, pd.MultiIndex):
+        msg = (
+            "pandas.MultiIndex not supported as index for obs or var on declaration.\n\
+            You can set `obs_names` manually although most operations after will error or convert to str.\n\
+            This behavior will likely be clarified in a future breaking release."
+        )
+        raise ValueError(msg)
     if length is not None and length != len(anno):
         raise _mk_df_error(source, attr, length, len(anno))
     anno = anno.copy(deep=False)
```
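The effect of the new guard, in a minimal (hypothetical) reproduction:

```python
import numpy as np
import pandas as pd

import anndata as ad

idx = pd.MultiIndex.from_tuples([("sample1", "a"), ("sample1", "b")])
obs = pd.DataFrame(index=idx)

try:
    ad.AnnData(X=np.zeros((2, 3), dtype=np.float32), obs=obs)
except ValueError as e:
    print(e)  # pandas.MultiIndex not supported as index for obs or var ...
```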

src/anndata/_core/index.py

```diff
@@ -3,7 +3,7 @@ from __future__ import annotations
 from collections.abc import Iterable, Sequence
 from functools import singledispatch
 from itertools import repeat
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, cast, overload

 import h5py
 import numpy as np
@@ -14,6 +14,8 @@ from ..compat import AwkArray, CSArray, CSMatrix, DaskArray, XDataArray
 from .xarray import Dataset2D

 if TYPE_CHECKING:
+    from numpy.typing import NDArray
+
     from ..compat import Index, Index1D, Index1DNorm
```

src/anndata/_core/index.py (continued)

```diff
@@ -161,7 +163,10 @@ def unpack_index(index: Index) -> tuple[Index1D, Index1D]:


 @singledispatch
-def _subset(a: np.ndarray | pd.DataFrame, subset_idx: Index):
+def _subset(
+    a: np.ndarray | pd.DataFrame,
+    subset_idx: tuple[Index1DNorm] | tuple[Index1DNorm, Index1DNorm],
+):
     # Select as combination of indexes, not coordinates
     # Correcting for indexing behaviour of np.ndarray
     if all(isinstance(x, Iterable) for x in subset_idx):
@@ -170,7 +175,9 @@ def _subset(a: np.ndarray | pd.DataFrame, subset_idx: Index):


 @_subset.register(DaskArray)
-def _subset_dask(a: DaskArray, subset_idx: Index):
+def _subset_dask(
+    a: DaskArray, subset_idx: tuple[Index1DNorm] | tuple[Index1DNorm, Index1DNorm]
+):
     if len(subset_idx) > 1 and all(isinstance(x, Iterable) for x in subset_idx):
         if issparse(a._meta) and a._meta.format == "csc":
             return a[:, subset_idx[1]][subset_idx[0], :]
@@ -180,24 +187,32 @@ def _subset_dask(a: DaskArray, subset_idx: Index):

 @_subset.register(CSMatrix)
 @_subset.register(CSArray)
-def _subset_sparse(
+def _subset_sparse(
+    a: CSMatrix | CSArray,
+    subset_idx: tuple[Index1DNorm] | tuple[Index1DNorm, Index1DNorm],
+):
     # Correcting for indexing behaviour of sparse.spmatrix
     if len(subset_idx) > 1 and all(isinstance(x, Iterable) for x in subset_idx):
         first_idx = subset_idx[0]
         if issubclass(first_idx.dtype.type, np.bool_):
-            first_idx = np.
+            first_idx = np.flatnonzero(first_idx)
         subset_idx = (first_idx.reshape(-1, 1), *subset_idx[1:])
     return a[subset_idx]


 @_subset.register(pd.DataFrame)
 @_subset.register(Dataset2D)
-def _subset_df(
+def _subset_df(
+    df: pd.DataFrame | Dataset2D,
+    subset_idx: tuple[Index1DNorm] | tuple[Index1DNorm, Index1DNorm],
+):
     return df.iloc[subset_idx]


 @_subset.register(AwkArray)
-def _subset_awkarray(a: AwkArray, subset_idx: Index):
+def _subset_awkarray(
+    a: AwkArray, subset_idx: tuple[Index1DNorm] | tuple[Index1DNorm, Index1DNorm]
+):
     if all(isinstance(x, Iterable) for x in subset_idx):
         subset_idx = np.ix_(*subset_idx)
     return a[subset_idx]
```
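`np.flatnonzero(mask)` is equivalent to `np.where(mask)[0]`: it turns a boolean mask into the integer positions that scipy's sparse containers need before the `reshape(-1, 1)` outer-indexing trick above. A small illustration:

```python
import numpy as np
from scipy import sparse

a = sparse.random(5, 4, density=0.5, format="csr", random_state=np.random.default_rng(0))
mask = np.array([True, False, True, False, True])

rows = np.flatnonzero(mask)   # array([0, 2, 4]); same as np.where(mask)[0]
cols = np.array([1, 3])
# Outer selection: a column vector of rows against a row of columns.
sub = a[rows.reshape(-1, 1), cols]
print(sub.shape)              # (3, 2)
```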

src/anndata/_core/index.py (continued)

```diff
@@ -205,23 +220,121 @@ def _subset_awkarray(a: AwkArray, subset_idx: Index):

 # Registration for SparseDataset occurs in sparse_dataset.py
 @_subset.register(h5py.Dataset)
-def _subset_dataset(
-    …
+def _subset_dataset(
+    d: h5py.Dataset, subset_idx: tuple[Index1DNorm] | tuple[Index1DNorm, Index1DNorm]
+):
+    order: tuple[NDArray[np.integer] | slice, ...]
+    inv_order: tuple[NDArray[np.integer] | slice, ...]
+    order, inv_order = zip(*map(_index_order_and_inverse, subset_idx), strict=True)
+    # check for duplicates or multi-dimensional fancy indexing
+    array_dims = [i for i in order if isinstance(i, np.ndarray)]
+    has_duplicates = any(len(np.unique(i)) != len(i) for i in array_dims)
+    # Use safe indexing if there are duplicates OR multiple array dimensions
+    # (h5py doesn't support multi-dimensional fancy indexing natively)
+    if has_duplicates or len(array_dims) > 1:
+        # For multi-dimensional indexing, bypass the sorting logic and use original indices
+        return _safe_fancy_index_h5py(d, subset_idx)
     # from hdf5, then to real order
-    return d[
-    …
+    return d[order][inv_order]
+
+
+@overload
+def _index_order_and_inverse(
+    axis_idx: NDArray[np.integer] | NDArray[np.bool_],
+) -> tuple[NDArray[np.integer], NDArray[np.integer]]: ...
+@overload
+def _index_order_and_inverse(axis_idx: slice) -> tuple[slice, slice]: ...
+def _index_order_and_inverse(
+    axis_idx: Index1DNorm,
+) -> tuple[Index1DNorm, NDArray[np.integer] | slice]:
+    """Order and get inverse index array."""
+    if not isinstance(axis_idx, np.ndarray):
+        return axis_idx, slice(None)
+    if axis_idx.dtype == bool:
+        axis_idx = np.flatnonzero(axis_idx)
+    order = np.argsort(axis_idx)
+    return axis_idx[order], np.argsort(order)
+
+
+@overload
+def _process_index_for_h5py(
+    idx: NDArray[np.integer] | NDArray[np.bool_],
+) -> tuple[NDArray[np.integer], NDArray[np.integer]]: ...
+@overload
+def _process_index_for_h5py(idx: slice) -> tuple[slice, None]: ...
+def _process_index_for_h5py(
+    idx: Index1DNorm,
+) -> tuple[Index1DNorm, NDArray[np.integer] | None]:
+    """Process a single index for h5py compatibility, handling sorting and duplicates."""
+    if not isinstance(idx, np.ndarray):
+        # Not an array (slice, integer, list) - no special processing needed
+        return idx, None
+
+    if idx.dtype == bool:
+        idx = np.flatnonzero(idx)
+
+    # For h5py fancy indexing, we need sorted indices
+    # But we also need to track how to reverse the sorting
+    unique, inverse = np.unique(idx, return_inverse=True)
+    return (
+        # Has duplicates - use unique + inverse mapping approach
+        (unique, inverse)
+        if len(unique) != len(idx)
+        # No duplicates - just sort and track reverse mapping
+        else _index_order_and_inverse(idx)
+    )
+
+
+def _safe_fancy_index_h5py(
+    dataset: h5py.Dataset,
+    subset_idx: tuple[Index1DNorm] | tuple[Index1DNorm, Index1DNorm],
+) -> h5py.Dataset:
+    # Handle multi-dimensional indexing of h5py dataset
+    # This avoids h5py's limitation with multi-dimensional fancy indexing
+    # without loading the entire dataset into memory
+
+    # Convert boolean arrays to integer arrays and handle sorting for h5py
+    processed_indices: tuple[NDArray[np.integer] | slice, ...]
+    reverse_indices: tuple[NDArray[np.integer] | None, ...]
+    processed_indices, reverse_indices = zip(
+        *map(_process_index_for_h5py, subset_idx), strict=True
+    )
+
+    # First find the index that reduces the size of the dataset the most
+    i_min = np.argmin([
+        _get_index_size(inds, dataset.shape[i]) / dataset.shape[i]
+        for i, inds in enumerate(processed_indices)
+    ])
+
+    # Apply the most selective index first to h5py dataset
+    first_index = [slice(None)] * len(processed_indices)
+    first_index[i_min] = processed_indices[i_min]
+    in_memory_array = cast("np.ndarray", dataset[tuple(first_index)])
+
+    # Apply remaining indices to the numpy array
+    remaining_indices = list(processed_indices)
+    remaining_indices[i_min] = slice(None)  # Already applied
+    result = in_memory_array[tuple(remaining_indices)]
+
+    # Now apply reverse mappings to get the original order
+    for dim, reverse_map in enumerate(reverse_indices):
+        if reverse_map is not None:
+            result = result.take(reverse_map, axis=dim)
+
+    return result
+
+
+def _get_index_size(idx: Index1DNorm, dim_size: int) -> int:
+    """Get size for any index type."""
+    if isinstance(idx, slice):
+        return len(range(*idx.indices(dim_size)))
+    elif isinstance(idx, int):
+        return 1
+    else:  # For other types, try to get length
+        return len(idx)
+
+
+def make_slice(idx, dimidx: int, n: int = 2) -> tuple[slice, ...]:
     mut = list(repeat(slice(None), n))
     mut[dimidx] = idx
     return tuple(mut)
```