PyPI - anndata - Versions diffs - 0.12.3__tar.gz → 0.12.5__tar.gz - Mend

anndata 0.12.3__tar.gz → 0.12.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (212) hide show

{anndata-0.12.3 → anndata-0.12.5}/.github/workflows/test-cpu.yml RENAMED Viewed

@@ -43,7 +43,7 @@ jobs:
     strategy:
       matrix:
         env: ${{ fromJSON(needs.get-environments.outputs.envs) }}
-        io_mark: ["zarr_io", "not zarr_io"]
+        io_mark: ["zarr_io", "not zarr_io", "dask_distributed"] # dask_distributed should not be run with -n auto as it uses a client with processes
     env:  # environment variables for use in codecov’s env_vars tagging
       ENV_NAME: ${{ matrix.env.name }}
       IO_MARK: ${{ matrix.io_mark }}
@@ -72,7 +72,7 @@ jobs:
         env:
           COVERAGE_PROCESS_START: ${{ github.workspace }}/pyproject.toml
         run: |
-          hatch run ${{ matrix.env.name }}:run-cov -v --color=yes -n auto --junitxml=test-data/test-results.xml -m "${{ matrix.io_mark }}" ${{ matrix.env.args }}
+          hatch run ${{ matrix.env.name }}:run-cov -v --color=yes ${{ matrix.io_mark != 'dask_distributed' && '-n auto' || '' }} --junitxml=test-data/test-results.xml -m "${{ matrix.io_mark }}" ${{ matrix.env.args }}
           hatch run ${{ matrix.env.name }}:cov-combine
           hatch run ${{ matrix.env.name }}:coverage xml

{anndata-0.12.3 → anndata-0.12.5}/.github/workflows/test-gpu.yml RENAMED Viewed

@@ -63,7 +63,7 @@ jobs:
           echo "max_python_version=$max_version" >> $GITHUB_ENV
       - name: Install UV
-        uses: astral-sh/setup-uv@v6
+        uses: astral-sh/setup-uv@v6 # TODO: upgrade once cirun image supports node 24
         with:
           enable-cache: true
           python-version: ${{ env.max_python_version }}

{anndata-0.12.3 → anndata-0.12.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: anndata
-Version: 0.12.3
+Version: 0.12.5
 Summary: Annotated data.
 Project-URL: Documentation, https://anndata.readthedocs.io/
 Project-URL: Source, https://github.com/scverse/anndata

anndata-0.12.5/benchmarks/benchmarks/backed_hdf5.py ADDED Viewed

@@ -0,0 +1,112 @@
+from __future__ import annotations
+import numpy as np
+import pandas as pd
+from scipy import sparse
+import anndata as ad
+file_paths = {"sparse": "adata_sparse.h5ad"}
+class BackedHDF5Indexing:
+    param_names = ("arr_type",)
+    params = ("sparse",)
+    def setup_cache(self):
+        X_sparse = sparse.random(
+            10000,
+            50000,
+            density=0.01,
+            format="csr",
+            random_state=np.random.default_rng(42),
+        )
+        for X, arr_type in [
+            (X_sparse, "sparse"),
+        ]:
+            n_obs, n_var = X.shape
+            # Create obs and var dataframes
+            obs = pd.DataFrame(
+                {
+                    "cell_type": pd.Categorical(
+                        np.random.choice(["TypeA", "TypeB", "TypeC"], n_obs)
+                    ),
+                    "total_counts": np.random.randint(1000, 5000, n_obs),
+                },
+                index=[f"cell_{i}" for i in range(n_obs)],
+            )
+            var = pd.DataFrame(
+                {
+                    "gene_name": [f"gene_{i}" for i in range(n_var)],
+                },
+                index=[f"ENSG_{i:08d}" for i in range(n_var)],
+            )
+            # Create AnnData object and save to HDF5
+            adata = ad.AnnData(X=X, obs=obs, var=var)
+            # Create temporary file
+            adata.write_h5ad(file_paths[arr_type])
+    def setup(self, arr_type):
+        # Open as backed
+        self.adata_backed = ad.read_h5ad(file_paths[arr_type], backed="r")
+        self.n_obs, self.n_var = self.adata_backed.shape
+        # Prepare indices for duplicate index testing
+        self.obs_idx_with_dupes = np.array([0, 1, 0, 2, 1] * (self.n_obs // 100 + 1))[
+            : (self.n_obs // 10)
+        ]
+        self.var_idx_with_dupes = np.array([0, 1, 2, 0, 3] * (self.n_var // 100 + 1))[
+            : (self.n_var // 10)
+        ]
+        self.obs_idx_no_dupes = np.arange(0, self.n_obs, 10)
+        self.var_idx_no_dupes = np.arange(0, self.n_var, 10)
+    def time_slice_obs(self, *_):
+        """Time slicing observations from backed HDF5"""
+        self.adata_backed[0 : (self.n_obs // 2), :]
+    def time_slice_obs_to_memory(self, *_):
+        """Time slicing observations from backed HDF5"""
+        self.adata_backed[0 : (self.n_obs // 2), :].to_memory()
+    def peakmem_slice_obs(self, *_):
+        """Peak memory for slicing observations from backed HDF5"""
+        self.adata_backed[0 : (self.n_obs // 2), :]
+    def time_fancy_index_no_dupes(self, *_):
+        """Time fancy indexing without duplicates"""
+        self.adata_backed[self.obs_idx_no_dupes, self.var_idx_no_dupes]
+    def peakmem_fancy_index_no_dupes(self, *_):
+        """Peak memory for fancy indexing without duplicates"""
+        self.adata_backed[self.obs_idx_no_dupes, self.var_idx_no_dupes]
+    def time_fancy_index_no_dupes_to_memory(self, *_):
+        """Time fancy indexing without duplicates"""
+        self.adata_backed[self.obs_idx_no_dupes, self.var_idx_no_dupes].to_memory()
+    def time_index_with_dupes_obs(self, *_):
+        """Time fancy indexing with duplicate observation indices"""
+        self.adata_backed[self.obs_idx_with_dupes, :]
+    def peakmem_index_with_dupes_obs(self, *_):
+        """Peak memory for fancy indexing with duplicate observation indices"""
+        self.adata_backed[self.obs_idx_with_dupes, :]
+    def time_to_memory_subset(self, *_):
+        """Time converting subset to memory"""
+        subset = self.adata_backed[0 : (self.n_obs // 4), 0 : (self.n_var // 4)]
+        subset.to_memory()
+    def peakmem_to_memory_subset(self, *_):
+        """Peak memory for converting subset to memory"""
+        subset = self.adata_backed[0 : (self.n_obs // 4), 0 : (self.n_var // 4)]
+        subset.to_memory()
+    def teardown(self, *_):
+        """Clean up temporary files"""
+        if hasattr(self, "adata_backed"):
+            self.adata_backed.file.close()

anndata-0.12.5/benchmarks/benchmarks/dataset2d.py ADDED Viewed

@@ -0,0 +1,89 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+import h5py
+import numpy as np
+import pandas as pd
+import zarr
+import anndata as ad
+if TYPE_CHECKING:
+    from typing import Literal
+class Dataset2D:
+    param_names = ("store_type", "chunks", "array_type")
+    params = (
+        ("zarr", "h5ad"),
+        ((-1,), None),
+        ("cat", "numeric", "string-array", "nullable-string-array"),
+    )
+    def setup_cache(self):
+        n_obs = 10000
+        array_types = {
+            "numeric": np.arange(n_obs),
+            "string-array": np.array(["a"] * n_obs),
+            "nullable-string-array": pd.array(
+                ["a", pd.NA] * (n_obs // 2), dtype="string"
+            ),
+            "cat": pd.Categorical(np.array(["a"] * n_obs)),
+        }
+        for k, v in array_types.items():
+            for store in [
+                h5py.File(f"data_{k}.h5ad", mode="w"),
+                zarr.open(f"data_{k}.zarr", mode="w", zarr_version=2),
+            ]:
+                df = pd.DataFrame({"a": v}, index=[f"cell{i}" for i in range(n_obs)])
+                if writing_string_array_on_disk := (
+                    isinstance(v, np.ndarray) and df["a"].dtype == "string"
+                ):
+                    df["a"] = df["a"].to_numpy()
+                with ad.settings.override(allow_write_nullable_strings=True):
+                    ad.io.write_elem(store, "df", df)
+                if writing_string_array_on_disk:
+                    assert store["df"]["a"].attrs["encoding-type"] == "string-array"
+    def setup(
+        self,
+        store_type: Literal["zarr", "h5ad"],
+        chunks: None | tuple[int],
+        array_type: Literal["cat", "numeric", "string-array", "nullable-string-array"],
+    ):
+        self.store = (
+            h5py.File(f"data_{array_type}.h5ad", mode="r")
+            if store_type == "h5ad"
+            else zarr.open(f"data_{array_type}.zarr")
+        )
+        self.ds = ad.experimental.read_elem_lazy(self.store["df"], chunks=chunks)
+        self.n_obs = self.ds.shape[0]
+    def time_read_lazy_default(self, *_):
+        ad.experimental.read_elem_lazy(self.store["df"])
+    def peakmem_read_lazy_default(self, *_):
+        ad.experimental.read_elem_lazy(self.store["df"])
+    def time_getitem_slice(self, *_):
+        self.ds.iloc[0 : (self.n_obs // 2)].to_memory()
+    def peakmem_getitem_slice(self, *_):
+        self.ds.iloc[0 : (self.n_obs // 2)].to_memory()
+    def time_full_to_memory(self, *_):
+        self.ds.to_memory()
+    def peakmem_full_to_memory(self, *_):
+        self.ds.to_memory()
+    def time_getitem_bool_mask(self, *_):
+        self.ds.iloc[np.random.randint(0, self.n_obs, self.n_obs // 2)].to_memory()
+    def peakmem_getitem_bool_mask(self, *_):
+        self.ds.iloc[np.random.randint(0, self.n_obs, self.n_obs // 2)].to_memory()
+    def time_concat(self, *_):
+        adatas = [ad.AnnData(obs=self.ds)] * 50
+        ad.concat(adatas, join="outer")

{anndata-0.12.3 → anndata-0.12.5}/benchmarks/benchmarks/readwrite.py RENAMED Viewed

@@ -38,52 +38,15 @@ from .utils import get_actualsize, get_peak_mem, sedate
 PBMC_3K_URL = "https://falexwolf.de/data/pbmc3k_raw.h5ad"
-# PBMC_3K_PATH = Path(__file__).parent / "data/pbmc3k_raw.h5ad"
-# PBMC_REDUCED_PATH = Path(__file__).parent / "10x_pbmc68k_reduced.h5ad"
-# BM_43K_CSR_PATH = Path(__file__).parent.parent / "datasets/BM2_43k-cells.h5ad"
-# BM_43K_CSC_PATH = Path(__file__).parent.parent / "datasets/BM2_43k-cells_CSC.h5ad"
-# class ZarrReadSuite:
-#     params = []
-#     param_names = ["input_url"]
-#     def setup(self, input_url):
-#         self.filepath = pooch.retrieve(url=input_url, known_hash=None)
-#     def time_read_full(self, input_url):
-#         anndata.read_zarr(self.filepath)
-#     def peakmem_read_full(self, input_url):
-#         anndata.read_zarr(self.filepath)
-#     def mem_readfull_object(self, input_url):
-#         return anndata.read_zarr(self.filepath)
-#     def track_read_full_memratio(self, input_url):
-#         mem_recording = memory_usage(
-#             (sedate(anndata.read_zarr, 0.005), (self.filepath,)), interval=0.001
-#         )
-#         adata = anndata.read_zarr(self.filepath)
-#         base_size = mem_recording[-1] - mem_recording[0]
-#         print(np.max(mem_recording) - np.min(mem_recording))
-#         print(base_size)
-#         return (np.max(mem_recording) - np.min(mem_recording)) / base_size
-#     def peakmem_read_backed(self, input_url):
-#         anndata.read_zarr(self.filepath, backed="r")
-#     def mem_read_backed_object(self, input_url):
-#         return anndata.read_zarr(self.filepath, backed="r")
 class H5ADInMemorySizeSuite:
-    _urls = MappingProxyType(dict(pbmc3k=PBMC_3K_URL))
-    params = _urls.keys()
-    param_names = ("input_data",)
+    filepath = "pbmc_in_mem.h5ad"
-    def setup(self, input_data: str):
-        self.filepath = pooch.retrieve(url=self._urls[input_data], known_hash=None)
+    def setup_cache(self):
+        # Need to specify path because the working directory is special for asv
+        pooch.retrieve(
+            url=PBMC_3K_URL, known_hash=None, path=Path.cwd(), fname=self.filepath
+        )
     def track_in_memory_size(self, *_):
         adata = anndata.read_h5ad(self.filepath)
@@ -99,12 +62,13 @@ class H5ADInMemorySizeSuite:
 class H5ADReadSuite:
-    _urls = MappingProxyType(dict(pbmc3k=PBMC_3K_URL))
-    params = _urls.keys()
-    param_names = ("input_data",)
+    filepath = "pbmc_read.h5ad"
-    def setup(self, input_data: str):
-        self.filepath = pooch.retrieve(url=self._urls[input_data], known_hash=None)
+    def setup_cache(self):
+        # Need to specify path because the working directory is special for asv
+        pooch.retrieve(
+            url=PBMC_3K_URL, known_hash=None, path=Path.cwd(), fname=self.filepath
+        )
     def time_read_full(self, *_):
         anndata.read_h5ad(self.filepath)

{anndata-0.12.3 → anndata-0.12.5}/benchmarks/benchmarks/sparse_dataset.py RENAMED Viewed

@@ -7,7 +7,7 @@ import zarr
 from dask.array.core import Array as DaskArray
 from scipy import sparse
-from anndata import AnnData
+from anndata import AnnData, concat
 from anndata._core.sparse_dataset import sparse_dataset
 from anndata._io.specs import write_elem
 from anndata.experimental import read_elem_lazy
@@ -21,7 +21,7 @@ def make_alternating_mask(n):
 class SparseCSRContiguousSlice:
-    _slices = MappingProxyType({
+    _indexers = MappingProxyType({
         "0:1000": slice(0, 1000),
         "0:9000": slice(0, 9000),
         ":9000:-1": slice(None, 9000, -1),
@@ -31,42 +31,80 @@ class SparseCSRContiguousSlice:
         "first": 0,
         "alternating": make_alternating_mask(10),
     })
+    filepath = "data.zarr"
     params = (
-        [
-            (10_000, 10_000),
-            # (10_000, 500)
-        ],
-        _slices.keys(),
+        list(_indexers.keys()),
         [True, False],
     )
-    param_names = ("shape", "slice", "use_dask")
+    param_names = (
+        "index",
+        "use_dask",
+    )
-    def setup(self, shape: tuple[int, int], slice: str, use_dask: bool):  # noqa: FBT001
+    def setup_cache(self):
         X = sparse.random(
-            *shape, density=0.01, format="csr", random_state=np.random.default_rng(42)
+            10_000,
+            10_000,
+            density=0.01,
+            format="csr",
+            random_state=np.random.default_rng(42),
         )
-        self.slice = self._slices[slice]
-        g = zarr.group()
+        g = zarr.group(self.filepath)
         write_elem(g, "X", X)
+    def setup(self, index: str, use_dask: bool):  # noqa: FBT001
+        g = zarr.open(self.filepath)
         self.x = read_elem_lazy(g["X"]) if use_dask else sparse_dataset(g["X"])
         self.adata = AnnData(self.x)
+        self.index = self._indexers[index]
     def time_getitem(self, *_):
-        res = self.x[self.slice]
+        res = self.x[self.index]
         if isinstance(res, DaskArray):
             res.compute()
     def peakmem_getitem(self, *_):
-        res = self.x[self.slice]
+        res = self.x[self.index]
         if isinstance(res, DaskArray):
             res.compute()
     def time_getitem_adata(self, *_):
-        res = self.adata[self.slice]
+        res = self.adata[self.index]
         if isinstance(res, DaskArray):
             res.compute()
     def peakmem_getitem_adata(self, *_):
-        res = self.adata[self.slice]
+        res = self.adata[self.index]
         if isinstance(res, DaskArray):
             res.compute()
+class SparseCSRDask:
+    filepath = "data.zarr"
+    def setup_cache(self):
+        X = sparse.random(
+            10_000,
+            10_000,
+            density=0.01,
+            format="csr",
+            random_state=np.random.default_rng(42),
+        )
+        g = zarr.group(self.filepath)
+        write_elem(g, "X", X)
+    def setup(self):
+        self.group = zarr.group(self.filepath)
+        self.adata = AnnData(X=read_elem_lazy(self.group["X"]))
+    def time_concat(self):
+        concat([self.adata for i in range(100)])
+    def peakmem_concat(self):
+        concat([self.adata for i in range(100)])
+    def time_read(self):
+        AnnData(X=read_elem_lazy(self.group["X"]))
+    def peakmem_read(self):
+        AnnData(X=read_elem_lazy(self.group["X"]))

{anndata-0.12.3 → anndata-0.12.5}/benchmarks/benchmarks/utils.py RENAMED Viewed

@@ -95,13 +95,31 @@ def gen_indexer(adata, dim, index_kind, ratio):
 def gen_adata(n_obs, n_var, attr_set):
     if "X-csr" in attr_set:
-        X = sparse.random(n_obs, n_var, density=0.1, format="csr")
+        X = sparse.random(
+            n_obs,
+            n_var,
+            density=0.1,
+            format="csr",
+            random_state=np.random.default_rng(42),
+        )
     elif "X-dense" in attr_set:
-        X = sparse.random(n_obs, n_var, density=0.1, format="csr")
+        X = sparse.random(
+            n_obs,
+            n_var,
+            density=0.1,
+            format="csr",
+            random_state=np.random.default_rng(42),
+        )
         X = X.toarray()
     else:
         # TODO: There's probably a better way to do this
-        X = sparse.random(n_obs, n_var, density=0, format="csr")
+        X = sparse.random(
+            n_obs,
+            n_var,
+            density=0,
+            format="csr",
+            random_state=np.random.default_rng(42),
+        )
     adata = AnnData(X)
     if "obs,var" in attr_set:
         adata.obs = pd.DataFrame(

anndata-0.12.5/docs/release-notes/0.12.4.md ADDED Viewed

@@ -0,0 +1,4 @@
+(v0.12.4)=
+### 0.12.4 {small}`2025-10-27`
+No significant changes.

anndata-0.12.5/docs/release-notes/0.12.5.md ADDED Viewed

@@ -0,0 +1,12 @@
+(v0.12.5)=
+### 0.12.5 {small}`2025-11-03`
+#### Bug fixes
+- Remove use of private `read_dataset` internally inside {func}`anndata.experimental.read_elem_lazy` {user}`ilan-gold` ({pr}`2158`)
+- Unblock version restriction on `dask` distributed writing by using threading scheduler always (see {pr}`2172`) {user}`ilan-gold` ({pr}`2183`)
+#### Performance
+- Use `name` on {func}`dask.array.map_blocks` internally when concatenating {class}`anndata.experimental.backed.Dataset2D` objects whose categoricals/nullable types must be converted to dask arrays {user}`ilan-gold` ({pr}`2121`)
+- Enable automatic sharding in zarr v3 via {attr}`anndata.settings.auto_shard_zarr_v3` (via {mod}`zarr`'s own auto sharding mechanism i.e., `shards="auto"`) for all types except {class}`numpy.recarray` {user}`ilan-gold` ({pr}`2167`)

anndata-0.12.5/docs/release-notes/2172.bug.md ADDED Viewed

@@ -0,0 +1 @@

+ {func}`dask.array.store` was producing corrupted data with zarr v3 + distributed scheduler + a lock (which we used internally): see {ref}`dask/dask#12109`. Thus dense arrays were potentially being stored with corrupted data. The solution is to remove the lock for newer versions of dask but without the lock in older versions, it is impossible to store the data. Thus versions of dask older than `2025.4.0` will not be supported for writing dense data. {user}`ilan-gold`

{anndata-0.12.3 → anndata-0.12.5}/docs/tutorials/zarr-v3.md RENAMED Viewed

@@ -38,7 +38,8 @@ There are two ways of opening remote `zarr` stores from the `zarr-python` packag
 Local data generally poses a different set of challenges.
 First, write speeds can be somewhat slow and second, the creation of many small files on a file system can slow down a filesystem.
 For the "many small files" problem, `zarr` has introduced {ref}`sharding <zarr:user-guide-sharding>` in the v3 file format.
-Sharding requires knowledge of the array element you are writing (such as shape or data type), though, and therefore you will need to use {func}`anndata.experimental.write_dispatched` to use sharding.
+We offer {attr}`anndata.settings.auto_shard_zarr_v3` to hook into zarr's ability to automatically compute shards, which is experimental at the moment.
+Manual sharding requires knowledge of the array element you are writing (such as shape or data type), though, and therefore you will need to use {func}`anndata.experimental.write_dispatched` to use custom sharding.
 For example, you cannot shard a 1D array with `shard` sizes `(256, 256)`.
 Here is a short example, although you should tune the sizes to your own use-case and also use the compression that makes the most sense for you:

{anndata-0.12.3 → anndata-0.12.5}/hatch.toml RENAMED Viewed

@@ -24,9 +24,13 @@ overrides.matrix.deps.env-vars = [
     { if = [ "min" ], key = "UV_CONSTRAINT", value = "ci/constraints.txt ci/min-deps.txt" },
 ]
 overrides.matrix.deps.pre-install-commands = [
-    { if = [ "min" ], value = "uv run ci/scripts/min-deps.py pyproject.toml --all-extras -o ci/min-deps.txt" },
-    # To prevent situations like https://github.com/pydata/xarray/issues/10419 going forward
-    { if = [ "pre" ], value = "echo xarray @ git+https://github.com/pydata/xarray.git > ci/pre-deps.txt" },
+    { if = [
+        "min",
+    ], value = "uv run ci/scripts/min-deps.py pyproject.toml --all-extras -o ci/min-deps.txt" },
+    # To prevent situations like https://github.com/pydata/xarray/issues/10419 going forward, and test against zarr as well
+    { if = [
+        "pre",
+    ], value = "echo 'xarray @ git+https://github.com/pydata/xarray.git\nzarr @ git+https://github.com/zarr-developers/zarr-python.git' > ci/pre-deps.txt" },
 ]
 overrides.matrix.deps.python = [

{anndata-0.12.3 → anndata-0.12.5}/pyproject.toml RENAMED Viewed

@@ -164,6 +164,7 @@ filterwarnings_when_strict = [
     "default:Consolidated metadata is:UserWarning",
     "default:.*Structured:zarr.core.dtype.common.UnstableSpecificationWarning",
     "default:.*FixedLengthUTF32:zarr.core.dtype.common.UnstableSpecificationWarning",
+    "default:Automatic shard shape inference is experimental",
 ]
 python_files = "test_*.py"
 testpaths = [
@@ -174,7 +175,11 @@ testpaths = [
 ]
 # For some reason this effects how logging is shown when tests are run
 xfail_strict = true
-markers = [ "gpu: mark test to run on GPU", "zarr_io: mark tests that involve zarr io" ]
+markers = [
+    "gpu: mark test to run on GPU",
+    "zarr_io: mark tests that involve zarr io",
+    "dask_distributed: tests that need a distributed client with multiple processes",
+]
 [tool.ruff]
 src = [ "src" ]

{anndata-0.12.3 → anndata-0.12.5}/src/anndata/_core/aligned_df.py RENAMED Viewed

@@ -78,6 +78,13 @@ def _gen_dataframe_df(
     attr: Literal["obs", "var"],
     length: int | None = None,
 ):
+    if isinstance(anno.index, pd.MultiIndex):
+        msg = (
+            "pandas.MultiIndex not supported as index for obs or var on declaration.\n\
+            You can set `obs_names` manually although most operations after will error or convert to str.\n\
+            This behavior will likely be clarified in a future breaking release."
+        )
+        raise ValueError(msg)
     if length is not None and length != len(anno):
         raise _mk_df_error(source, attr, length, len(anno))
     anno = anno.copy(deep=False)

anndata 0.12.3__tar.gz → 0.12.5__tar.gz

anndata 0.12.3__tar.gz → 0.12.5__tar.gz