anndata 0.12.4__tar.gz → 0.12.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {anndata-0.12.4 → anndata-0.12.5}/PKG-INFO +1 -1
- anndata-0.12.5/benchmarks/benchmarks/dataset2d.py +89 -0
- {anndata-0.12.4 → anndata-0.12.5}/benchmarks/benchmarks/sparse_dataset.py +32 -1
- anndata-0.12.5/docs/release-notes/0.12.5.md +12 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/tutorials/zarr-v3.md +2 -1
- {anndata-0.12.4 → anndata-0.12.5}/pyproject.toml +1 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_core/merge.py +2 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_io/specs/lazy_methods.py +6 -5
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_io/specs/methods.py +15 -12
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_settings.py +37 -12
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_settings.pyi +3 -2
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/experimental/backed/_lazy_arrays.py +2 -2
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/tests/helpers.py +22 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_concatenate_disk.py +20 -3
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_dask.py +13 -14
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_dask_view_mem.py +1 -1
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_io_dispatched.py +7 -14
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_io_elementwise.py +68 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_settings.py +1 -1
- anndata-0.12.4/benchmarks/benchmarks/dataset2d.py +0 -63
- {anndata-0.12.4 → anndata-0.12.5}/.cirun.yml +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.codecov.yml +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.editorconfig +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.github/ISSUE_TEMPLATE/bug-report.yml +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.github/ISSUE_TEMPLATE/enhancement-request.yml +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.github/ISSUE_TEMPLATE/question.yml +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.github/dependabot.yml +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.github/workflows/benchmark.yml +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.github/workflows/check-pr.yml +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.github/workflows/close-stale.yml +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.github/workflows/codespell.yml +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.github/workflows/label-stale.yml +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.github/workflows/publish.yml +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.github/workflows/test-cpu.yml +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.github/workflows/test-gpu.yml +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.gitignore +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.gitmodules +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.pre-commit-config.yaml +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.prettierignore +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.prettierrc.yaml +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.readthedocs.yml +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.taplo.toml +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.vscode/launch.json +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/.vscode/settings.json +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/LICENSE +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/README.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/benchmarks/README.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/benchmarks/asv.conf.json +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/benchmarks/benchmarks/__init__.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/benchmarks/benchmarks/anndata.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/benchmarks/benchmarks/backed_hdf5.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/benchmarks/benchmarks/readwrite.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/benchmarks/benchmarks/utils.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/biome.jsonc +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/ci/constraints.txt +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/ci/scripts/min-deps.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/ci/scripts/towncrier_automation.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/Makefile +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/_key_contributors.rst +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/_static/img/anndata_schema.svg +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/_templates/autosummary/class.rst +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/api.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/benchmark-read-write.ipynb +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/benchmarks.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/concatenation.rst +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/conf.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/contributing.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/extensions/autosummary_skip_inherited.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/extensions/no_skip_abc_members.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/extensions/patch_myst_cite.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/fileformat-prose.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/index.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/interoperability.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/news.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/references.rst +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.10.0.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.10.1.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.10.2.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.10.3.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.10.4.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.10.5.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.10.6.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.10.7.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.10.8.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.10.9.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.11.0.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.11.1.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.11.2.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.11.3.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.11.4.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.12.0.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.12.1.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.12.2.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.12.3.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.12.4.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.4.0.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.5.0.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.6.0.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.6.x.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.7.0.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.7.2.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.7.3.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.7.4.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.7.5.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.7.6.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.7.7.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.7.8.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.8.0.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.9.0.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.9.1.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/0.9.2.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/2172.bug.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/release-notes/index.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/docs/tutorials/index.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/hatch.toml +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/__init__.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_core/__init__.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_core/access.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_core/aligned_df.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_core/aligned_mapping.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_core/anndata.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_core/extensions.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_core/file_backing.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_core/index.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_core/raw.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_core/sparse_dataset.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_core/storage.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_core/views.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_core/xarray.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_io/__init__.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_io/h5ad.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_io/read.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_io/specs/__init__.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_io/specs/registry.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_io/utils.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_io/write.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_io/zarr.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_types.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/_warnings.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/abc.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/compat/__init__.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/experimental/__init__.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/experimental/_dispatch_io.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/experimental/backed/__init__.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/experimental/backed/_compat.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/experimental/backed/_io.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/experimental/merge.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/experimental/multi_files/__init__.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/experimental/multi_files/_anncollection.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/experimental/pytorch/__init__.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/experimental/pytorch/_annloader.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/io.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/logging.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/tests/__init__.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/types.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/typing.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/anndata/utils.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/testing/anndata/__init__.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/testing/anndata/_doctest.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/testing/anndata/_pytest.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/src/testing/anndata/py.typed +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/conftest.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/data/adata-comments.tsv +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/data/adata.csv +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/data/archives/readme.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/data/archives/v0.11.4/adata.h5ad +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/data/archives/v0.11.4/adata.zarr.zip +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/data/archives/v0.11.4/readme.md +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/data/archives/v0.7.0/adata.h5ad +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/data/archives/v0.7.0/adata.zarr.zip +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/data/archives/v0.7.8/adata.h5ad +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/data/archives/v0.7.8/adata.zarr.zip +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/data/excel.xlsx +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/data/umi_tools.tsv.gz +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/lazy/conftest.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/lazy/test_concat.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/lazy/test_read.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/lazy/test_write.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_anncollection.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_annot.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_awkward.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_backed_dense.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_backed_hdf5.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_backed_sparse.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_base.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_concatenate.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_deprecations.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_extensions.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_get_vector.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_gpu.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_helpers.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_inplace_subset.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_io_backwards_compat.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_io_conversion.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_io_partial.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_io_utils.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_io_warnings.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_layers.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_obsmvarm.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_obspvarp.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_raw.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_readwrite.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_repr.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_structured_arrays.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_transpose.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_uns.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_utils.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_views.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_x.py +0 -0
- {anndata-0.12.4 → anndata-0.12.5}/tests/test_xarray.py +0 -0
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
import h5py
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
import zarr
|
|
9
|
+
|
|
10
|
+
import anndata as ad
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from typing import Literal
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Dataset2D:
    """asv-style benchmarks for lazily-read dataframes (``read_elem_lazy``).

    Each benchmark is parametrized over the on-disk store type, the ``chunks``
    argument handed to ``read_elem_lazy`` and the kind of column stored in the
    dataframe.
    """

    param_names = ("store_type", "chunks", "array_type")
    params = (
        ("zarr", "h5ad"),
        ((-1,), None),
        ("cat", "numeric", "string-array", "nullable-string-array"),
    )

    def setup_cache(self):
        """Write one single-column dataframe per array type to both hdf5 and zarr."""
        n_obs = 10000
        array_types = {
            "numeric": np.arange(n_obs),
            "string-array": np.array(["a"] * n_obs),
            "nullable-string-array": pd.array(
                ["a", pd.NA] * (n_obs // 2), dtype="string"
            ),
            "cat": pd.Categorical(np.array(["a"] * n_obs)),
        }
        # The index labels do not depend on the array type or store: build once.
        obs_names = [f"cell{i}" for i in range(n_obs)]
        for k, v in array_types.items():
            # Close the hdf5 file deterministically so everything is flushed to
            # disk before `setup` re-opens it read-only.
            with h5py.File(f"data_{k}.h5ad", mode="w") as h5_store:
                zarr_store = zarr.open(f"data_{k}.zarr", mode="w", zarr_version=2)
                for store in (h5_store, zarr_store):
                    df = pd.DataFrame({"a": v}, index=obs_names)
                    # If pandas inferred its "string" dtype for a plain numpy
                    # array, convert the column back to numpy and verify below
                    # that it was written with the "string-array" encoding.
                    if writing_string_array_on_disk := (
                        isinstance(v, np.ndarray) and df["a"].dtype == "string"
                    ):
                        df["a"] = df["a"].to_numpy()
                    with ad.settings.override(allow_write_nullable_strings=True):
                        ad.io.write_elem(store, "df", df)
                    if writing_string_array_on_disk:
                        assert store["df"]["a"].attrs["encoding-type"] == "string-array"

    def setup(
        self,
        store_type: Literal["zarr", "h5ad"],
        chunks: None | tuple[int],
        array_type: Literal["cat", "numeric", "string-array", "nullable-string-array"],
    ):
        """Open the store written by `setup_cache` and lazily read its dataframe."""
        self.store = (
            h5py.File(f"data_{array_type}.h5ad", mode="r")
            if store_type == "h5ad"
            else zarr.open(f"data_{array_type}.zarr")
        )
        self.ds = ad.experimental.read_elem_lazy(self.store["df"], chunks=chunks)
        self.n_obs = self.ds.shape[0]

    def teardown(self, *_):
        """Release the hdf5 handle opened in `setup` (zarr groups are left as-is)."""
        store = getattr(self, "store", None)
        if isinstance(store, h5py.File):
            store.close()

    def time_read_lazy_default(self, *_):
        ad.experimental.read_elem_lazy(self.store["df"])

    def peakmem_read_lazy_default(self, *_):
        ad.experimental.read_elem_lazy(self.store["df"])

    def time_getitem_slice(self, *_):
        self.ds.iloc[0 : (self.n_obs // 2)].to_memory()

    def peakmem_getitem_slice(self, *_):
        self.ds.iloc[0 : (self.n_obs // 2)].to_memory()

    def time_full_to_memory(self, *_):
        self.ds.to_memory()

    def peakmem_full_to_memory(self, *_):
        self.ds.to_memory()

    # NOTE(review): despite the "bool_mask" name, the two benchmarks below index
    # with random *integer* positions (`np.random.randint`), not a boolean mask.
    def time_getitem_bool_mask(self, *_):
        self.ds.iloc[np.random.randint(0, self.n_obs, self.n_obs // 2)].to_memory()

    def peakmem_getitem_bool_mask(self, *_):
        self.ds.iloc[np.random.randint(0, self.n_obs, self.n_obs // 2)].to_memory()

    def time_concat(self, *_):
        # The same lazily-backed AnnData object is repeated 50 times.
        adatas = [ad.AnnData(obs=self.ds)] * 50
        ad.concat(adatas, join="outer")
|
|
@@ -7,7 +7,7 @@ import zarr
|
|
|
7
7
|
from dask.array.core import Array as DaskArray
|
|
8
8
|
from scipy import sparse
|
|
9
9
|
|
|
10
|
-
from anndata import AnnData
|
|
10
|
+
from anndata import AnnData, concat
|
|
11
11
|
from anndata._core.sparse_dataset import sparse_dataset
|
|
12
12
|
from anndata._io.specs import write_elem
|
|
13
13
|
from anndata.experimental import read_elem_lazy
|
|
@@ -77,3 +77,34 @@ class SparseCSRContiguousSlice:
|
|
|
77
77
|
res = self.adata[self.index]
|
|
78
78
|
if isinstance(res, DaskArray):
|
|
79
79
|
res.compute()
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class SparseCSRDask:
    """Benchmarks for a lazily-read CSR matrix stored in a zarr group."""

    filepath = "data.zarr"

    def setup_cache(self):
        """Write a seeded random 10,000 x 10,000 CSR matrix (1% density) to `filepath`."""
        rng = np.random.default_rng(42)
        matrix = sparse.random(
            10_000, 10_000, density=0.01, format="csr", random_state=rng
        )
        write_elem(zarr.group(self.filepath), "X", matrix)

    def setup(self):
        """Open the group and wrap the lazily-read matrix in an AnnData object."""
        self.group = zarr.group(self.filepath)
        lazy_x = read_elem_lazy(self.group["X"])
        self.adata = AnnData(X=lazy_x)

    def time_concat(self):
        # The same AnnData object is repeated 100 times.
        concat([self.adata] * 100)

    def peakmem_concat(self):
        concat([self.adata] * 100)

    def time_read(self):
        lazy_x = read_elem_lazy(self.group["X"])
        AnnData(X=lazy_x)

    def peakmem_read(self):
        lazy_x = read_elem_lazy(self.group["X"])
        AnnData(X=lazy_x)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
(v0.12.5)=
|
|
2
|
+
### 0.12.5 {small}`2025-11-03`
|
|
3
|
+
|
|
4
|
+
#### Bug fixes
|
|
5
|
+
|
|
6
|
+
- Remove use of private `read_dataset` internally inside {func}`anndata.experimental.read_elem_lazy` {user}`ilan-gold` ({pr}`2158`)
|
|
7
|
+
- Lift the version restriction on `dask` distributed writing by always using the threaded scheduler for the write (see {pr}`2172`) {user}`ilan-gold` ({pr}`2183`)
|
|
8
|
+
|
|
9
|
+
#### Performance
|
|
10
|
+
|
|
11
|
+
- Use `name` on {func}`dask.array.map_blocks` internally when concatenating {class}`anndata.experimental.backed.Dataset2D` objects whose categoricals/nullable types must be converted to dask arrays {user}`ilan-gold` ({pr}`2121`)
|
|
12
|
+
- Enable automatic sharding in zarr v3 via {attr}`anndata.settings.auto_shard_zarr_v3` (via {mod}`zarr`'s own auto sharding mechanism i.e., `shards="auto"`) for all types except {class}`numpy.recarray` {user}`ilan-gold` ({pr}`2167`)
|
|
@@ -38,7 +38,8 @@ There are two ways of opening remote `zarr` stores from the `zarr-python` packag
|
|
|
38
38
|
Local data generally poses a different set of challenges.
|
|
39
39
|
First, write speeds can be somewhat slow and second, the creation of many small files on a file system can slow down a filesystem.
|
|
40
40
|
For the "many small files" problem, `zarr` has introduced {ref}`sharding <zarr:user-guide-sharding>` in the v3 file format.
|
|
41
|
-
|
|
41
|
+
We offer {attr}`anndata.settings.auto_shard_zarr_v3` to hook into zarr's ability to automatically compute shards, which is experimental at the moment.
|
|
42
|
+
Manual sharding, however, requires knowledge of the array element you are writing (such as its shape or data type), so you will need {func}`anndata.experimental.write_dispatched` to apply custom sharding.
|
|
42
43
|
For example, you cannot shard a 1D array with `shard` sizes `(256, 256)`.
|
|
43
44
|
Here is a short example, although you should tune the sizes to your own use-case and also use the compression that makes the most sense for you:
|
|
44
45
|
|
|
@@ -164,6 +164,7 @@ filterwarnings_when_strict = [
|
|
|
164
164
|
"default:Consolidated metadata is:UserWarning",
|
|
165
165
|
"default:.*Structured:zarr.core.dtype.common.UnstableSpecificationWarning",
|
|
166
166
|
"default:.*FixedLengthUTF32:zarr.core.dtype.common.UnstableSpecificationWarning",
|
|
167
|
+
"default:Automatic shard shape inference is experimental",
|
|
167
168
|
]
|
|
168
169
|
python_files = "test_*.py"
|
|
169
170
|
testpaths = [
|
|
@@ -4,6 +4,7 @@ Code for merging/ concatenating AnnData objects.
|
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
6
6
|
|
|
7
|
+
import uuid
|
|
7
8
|
from collections import OrderedDict
|
|
8
9
|
from collections.abc import Callable, Mapping, MutableSet
|
|
9
10
|
from functools import partial, reduce, singledispatch
|
|
@@ -1251,6 +1252,7 @@ def make_dask_col_from_extension_dtype(
|
|
|
1251
1252
|
chunks=chunk_size,
|
|
1252
1253
|
meta=np.array([], dtype=dtype),
|
|
1253
1254
|
dtype=dtype,
|
|
1255
|
+
name=f"{uuid.uuid4()}/{base_path_or_zarr_group}/{elem_name}-{dtype}",
|
|
1254
1256
|
)
|
|
1255
1257
|
|
|
1256
1258
|
return da.from_array(col.values, chunks=-1) # in-memory
|
|
@@ -25,7 +25,7 @@ from anndata.compat import (
|
|
|
25
25
|
ZarrGroup,
|
|
26
26
|
)
|
|
27
27
|
|
|
28
|
-
from .registry import _LAZY_REGISTRY, IOSpec
|
|
28
|
+
from .registry import _LAZY_REGISTRY, IOSpec, read_elem
|
|
29
29
|
|
|
30
30
|
if TYPE_CHECKING:
|
|
31
31
|
from collections.abc import Generator, Mapping, Sequence
|
|
@@ -195,6 +195,9 @@ def resolve_chunks(
|
|
|
195
195
|
return elem.chunks
|
|
196
196
|
|
|
197
197
|
|
|
198
|
+
# TODO: `map_blocks` of a string array in h5py is so insanely slow on benchmarking that in the case someone has
|
|
199
|
+
# a pure string annotation (not categoricals! or nullable strings!), it's probably better to pay the memory penalty.
|
|
200
|
+
# In the long run, it might be good to figure out what exactly is going on here but for now, this will do.
|
|
198
201
|
@_LAZY_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0"))
|
|
199
202
|
def read_h5_string_array(
|
|
200
203
|
elem: H5Array,
|
|
@@ -204,10 +207,8 @@ def read_h5_string_array(
|
|
|
204
207
|
) -> DaskArray:
|
|
205
208
|
import dask.array as da
|
|
206
209
|
|
|
207
|
-
from anndata._io.h5ad import read_dataset
|
|
208
|
-
|
|
209
210
|
chunks = resolve_chunks(elem, chunks, tuple(elem.shape))
|
|
210
|
-
return da.from_array(
|
|
211
|
+
return da.from_array(read_elem(elem), chunks=chunks)
|
|
211
212
|
|
|
212
213
|
|
|
213
214
|
@_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0"))
|
|
@@ -303,7 +304,7 @@ def read_dataframe(
|
|
|
303
304
|
# which is used below as well.
|
|
304
305
|
if not use_range_index:
|
|
305
306
|
dim_name = elem.attrs["_index"]
|
|
306
|
-
# no sense in reading this in multiple times
|
|
307
|
+
# no sense in reading this in multiple times since xarray requires an in-memory index
|
|
307
308
|
index = elem_dict[dim_name].compute()
|
|
308
309
|
else:
|
|
309
310
|
dim_name = DUMMY_RANGE_INDEX_KEY
|
|
@@ -102,6 +102,12 @@ def zarr_v3_compressor_compat(dataset_kwargs) -> dict:
|
|
|
102
102
|
return dataset_kwargs
|
|
103
103
|
|
|
104
104
|
|
|
105
|
+
def zarr_v3_sharding(dataset_kwargs) -> dict:
    """Add ``shards="auto"`` to the dataset kwargs when auto-sharding applies.

    The input mapping is returned untouched when the caller already supplied
    a ``"shards"`` entry or when ``ad.settings.auto_shard_zarr_v3`` is falsy.
    Otherwise a new dict is returned with ``"shards": "auto"`` added; the
    input mapping itself is never mutated.
    """
    # An explicit "shards" kwarg always wins; check it first so the setting is
    # only consulted when needed.
    if "shards" in dataset_kwargs or not ad.settings.auto_shard_zarr_v3:
        return dataset_kwargs
    return {**dataset_kwargs, "shards": "auto"}
|
|
109
|
+
|
|
110
|
+
|
|
105
111
|
def _to_cpu_mem_wrapper(write_func):
|
|
106
112
|
"""
|
|
107
113
|
Wrapper to bring cupy types into cpu memory before writing.
|
|
@@ -432,6 +438,7 @@ def write_basic(
|
|
|
432
438
|
f.create_dataset(k, data=elem, shape=elem.shape, dtype=dtype, **dataset_kwargs)
|
|
433
439
|
else:
|
|
434
440
|
dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs)
|
|
441
|
+
dataset_kwargs = zarr_v3_sharding(dataset_kwargs)
|
|
435
442
|
f.create_array(k, shape=elem.shape, dtype=dtype, **dataset_kwargs)
|
|
436
443
|
# see https://github.com/zarr-developers/zarr-python/discussions/2712
|
|
437
444
|
if isinstance(elem, ZarrArray | H5Array):
|
|
@@ -506,26 +513,17 @@ def write_basic_dask_dask_dense(
|
|
|
506
513
|
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
|
|
507
514
|
):
|
|
508
515
|
import dask.array as da
|
|
509
|
-
import dask.config as dc
|
|
510
|
-
|
|
511
|
-
is_distributed = dc.get("scheduler", None) == "dask.distributed"
|
|
512
|
-
is_h5 = isinstance(f, H5Group)
|
|
513
|
-
if is_distributed and is_h5:
|
|
514
|
-
msg = "Cannot write dask arrays to hdf5 when using distributed scheduler"
|
|
515
|
-
raise ValueError(msg)
|
|
516
516
|
|
|
517
517
|
dataset_kwargs = dataset_kwargs.copy()
|
|
518
|
+
is_h5 = isinstance(f, H5Group)
|
|
518
519
|
if not is_h5:
|
|
519
520
|
dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs)
|
|
520
|
-
|
|
521
|
-
if Version(version("dask")) < Version("2025.4.0") and is_distributed:
|
|
522
|
-
msg = "Writing dense data with a distributed scheduler to zarr could produce corrupted data with a Lock and will error without one when dask is older than 2025.4.0: https://github.com/dask/dask/issues/12109"
|
|
523
|
-
raise RuntimeError(msg)
|
|
521
|
+
dataset_kwargs = zarr_v3_sharding(dataset_kwargs)
|
|
524
522
|
if is_zarr_v2() or is_h5:
|
|
525
523
|
g = f.require_dataset(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs)
|
|
526
524
|
else:
|
|
527
525
|
g = f.require_array(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs)
|
|
528
|
-
da.store(elem, g)
|
|
526
|
+
da.store(elem, g, scheduler="threads")
|
|
529
527
|
|
|
530
528
|
|
|
531
529
|
@_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0"))
|
|
@@ -626,6 +624,7 @@ def write_vlen_string_array_zarr(
|
|
|
626
624
|
filters, fill_value = None, None
|
|
627
625
|
if f.metadata.zarr_format == 2:
|
|
628
626
|
filters, fill_value = [VLenUTF8()], ""
|
|
627
|
+
dataset_kwargs = zarr_v3_sharding(dataset_kwargs)
|
|
629
628
|
f.create_array(
|
|
630
629
|
k,
|
|
631
630
|
shape=elem.shape,
|
|
@@ -694,6 +693,9 @@ def write_recarray_zarr(
|
|
|
694
693
|
else:
|
|
695
694
|
dataset_kwargs = dataset_kwargs.copy()
|
|
696
695
|
dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs)
|
|
696
|
+
# https://github.com/zarr-developers/zarr-python/issues/3546
|
|
697
|
+
# if "shards" not in dataset_kwargs and ad.settings.auto_shard_zarr_v3:
|
|
698
|
+
# dataset_kwargs = {**dataset_kwargs, "shards": "auto"}
|
|
697
699
|
f.create_array(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs)
|
|
698
700
|
f[k][...] = elem
|
|
699
701
|
|
|
@@ -730,6 +732,7 @@ def write_sparse_compressed(
|
|
|
730
732
|
attr_name, data=attr, shape=attr.shape, dtype=dtype, **dataset_kwargs
|
|
731
733
|
)
|
|
732
734
|
else:
|
|
735
|
+
dataset_kwargs = zarr_v3_sharding(dataset_kwargs)
|
|
733
736
|
arr = g.create_array(
|
|
734
737
|
attr_name, shape=attr.shape, dtype=dtype, **dataset_kwargs
|
|
735
738
|
)
|
|
@@ -17,7 +17,7 @@ from .compat import is_zarr_v2, old_positionals
|
|
|
17
17
|
|
|
18
18
|
if TYPE_CHECKING:
|
|
19
19
|
from collections.abc import Callable, Sequence
|
|
20
|
-
from typing import Any, TypeGuard
|
|
20
|
+
from typing import Any, Self, TypeGuard
|
|
21
21
|
|
|
22
22
|
T = TypeVar("T")
|
|
23
23
|
|
|
@@ -55,7 +55,7 @@ class RegisteredOption(NamedTuple, Generic[T]):
|
|
|
55
55
|
option: str
|
|
56
56
|
default_value: T
|
|
57
57
|
description: str
|
|
58
|
-
validate: Callable[[T], None]
|
|
58
|
+
validate: Callable[[T, SettingsManager], None]
|
|
59
59
|
type: object
|
|
60
60
|
|
|
61
61
|
describe = describe
|
|
@@ -206,7 +206,7 @@ class SettingsManager:
|
|
|
206
206
|
*,
|
|
207
207
|
default_value: T,
|
|
208
208
|
description: str,
|
|
209
|
-
validate: Callable[[T], None],
|
|
209
|
+
validate: Callable[[T, Self], None],
|
|
210
210
|
option_type: object | None = None,
|
|
211
211
|
get_from_env: Callable[[str, T], T] = lambda x, y: y,
|
|
212
212
|
) -> None:
|
|
@@ -229,7 +229,7 @@ class SettingsManager:
|
|
|
229
229
|
Default behavior is to return `default_value` without checking the environment.
|
|
230
230
|
"""
|
|
231
231
|
try:
|
|
232
|
-
validate(default_value)
|
|
232
|
+
validate(default_value, self)
|
|
233
233
|
except (ValueError, TypeError) as e:
|
|
234
234
|
e.add_note(f"for option {option!r}")
|
|
235
235
|
raise e
|
|
@@ -307,7 +307,7 @@ class SettingsManager:
|
|
|
307
307
|
)
|
|
308
308
|
raise AttributeError(msg)
|
|
309
309
|
registered_option = self._registered_options[option]
|
|
310
|
-
registered_option.validate(val)
|
|
310
|
+
registered_option.validate(val, self)
|
|
311
311
|
self._config[option] = val
|
|
312
312
|
|
|
313
313
|
def __getattr__(self, option: str) -> object:
|
|
@@ -364,10 +364,13 @@ class SettingsManager:
|
|
|
364
364
|
"""
|
|
365
365
|
restore = {a: getattr(self, a) for a in overrides}
|
|
366
366
|
try:
|
|
367
|
-
|
|
368
|
-
|
|
367
|
+
# Preserve order so that settings that depend on each other can be overridden together i.e., always override zarr version before sharding
|
|
368
|
+
for k in self._config:
|
|
369
|
+
if k in overrides:
|
|
370
|
+
setattr(self, k, overrides.get(k))
|
|
369
371
|
yield None
|
|
370
372
|
finally:
|
|
373
|
+
# TODO: does the order need to be preserved when restoring?
|
|
371
374
|
for attr, value in restore.items():
|
|
372
375
|
setattr(self, attr, value)
|
|
373
376
|
|
|
@@ -395,7 +398,7 @@ V = TypeVar("V")
|
|
|
395
398
|
|
|
396
399
|
|
|
397
400
|
def gen_validator(_type: type[V]) -> Callable[[V], None]:
|
|
398
|
-
def validate_type(val: V) -> None:
|
|
401
|
+
def validate_type(val: V, settings: SettingsManager) -> None:
|
|
399
402
|
if not isinstance(val, _type):
|
|
400
403
|
msg = f"{val} not valid {_type}"
|
|
401
404
|
raise TypeError(msg)
|
|
@@ -434,14 +437,28 @@ settings.register(
|
|
|
434
437
|
)
|
|
435
438
|
|
|
436
439
|
|
|
437
|
-
def validate_zarr_write_format(format: int):
|
|
438
|
-
validate_int(format)
|
|
440
|
+
def validate_zarr_write_format(format: int, settings: SettingsManager):
|
|
441
|
+
validate_int(format, settings)
|
|
439
442
|
if format not in {2, 3}:
|
|
440
443
|
msg = "non-v2 zarr on-disk format not supported"
|
|
441
444
|
raise ValueError(msg)
|
|
442
445
|
if format == 3 and is_zarr_v2():
|
|
443
446
|
msg = "Cannot write v3 format against v2 package"
|
|
444
447
|
raise ValueError(msg)
|
|
448
|
+
if format == 2 and getattr(settings, "auto_shard_zarr_v3", False):
|
|
449
|
+
msg = "Cannot set `zarr_write_format` to 2 with autosharding on. Please set to `False` `anndata.settings.auto_shard_zarr_v3`"
|
|
450
|
+
raise ValueError(msg)
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
def validate_zarr_sharding(auto_shard: bool, settings: SettingsManager): # noqa: FBT001
|
|
454
|
+
validate_bool(auto_shard, settings)
|
|
455
|
+
if auto_shard:
|
|
456
|
+
if is_zarr_v2():
|
|
457
|
+
msg = "Cannot use sharding with `zarr-python<3`. Please upgrade package and set `anndata.settings.zarr_write_format` to 3."
|
|
458
|
+
raise ValueError(msg)
|
|
459
|
+
if settings.zarr_write_format == 2:
|
|
460
|
+
msg = "Cannot shard v2 format data. Please set `anndata.settings.zarr_write_format` to 3."
|
|
461
|
+
raise ValueError(msg)
|
|
445
462
|
|
|
446
463
|
|
|
447
464
|
settings.register(
|
|
@@ -458,8 +475,8 @@ settings.register(
|
|
|
458
475
|
)
|
|
459
476
|
|
|
460
477
|
|
|
461
|
-
def validate_sparse_settings(val: Any) -> None:
|
|
462
|
-
validate_bool(val)
|
|
478
|
+
def validate_sparse_settings(val: Any, settings: SettingsManager) -> None:
|
|
479
|
+
validate_bool(val, settings)
|
|
463
480
|
|
|
464
481
|
|
|
465
482
|
settings.register(
|
|
@@ -486,6 +503,14 @@ settings.register(
|
|
|
486
503
|
get_from_env=check_and_get_bool,
|
|
487
504
|
)
|
|
488
505
|
|
|
506
|
+
settings.register(
|
|
507
|
+
"auto_shard_zarr_v3",
|
|
508
|
+
default_value=False,
|
|
509
|
+
description="Whether or not to use zarr's auto computation of sharding for v3. For v2 this setting will be ignored. The setting will apply to all calls to anndata's writing mechanism (write_zarr / write_elem) and will **not** override any user-defined kwargs for shards.",
|
|
510
|
+
validate=validate_zarr_sharding,
|
|
511
|
+
get_from_env=check_and_get_bool,
|
|
512
|
+
)
|
|
513
|
+
|
|
489
514
|
|
|
490
515
|
##################################################################################
|
|
491
516
|
##################################################################################
|
|
@@ -2,7 +2,7 @@ from collections.abc import Callable as Callable
|
|
|
2
2
|
from collections.abc import Generator, Iterable
|
|
3
3
|
from contextlib import contextmanager
|
|
4
4
|
from dataclasses import dataclass
|
|
5
|
-
from typing import Literal, TypeVar
|
|
5
|
+
from typing import Literal, Self, TypeVar
|
|
6
6
|
|
|
7
7
|
_T = TypeVar("_T")
|
|
8
8
|
|
|
@@ -25,7 +25,7 @@ class SettingsManager:
|
|
|
25
25
|
*,
|
|
26
26
|
default_value: _T,
|
|
27
27
|
description: str,
|
|
28
|
-
validate: Callable[[_T], None],
|
|
28
|
+
validate: Callable[[_T, Self], None],
|
|
29
29
|
option_type: object | None = None,
|
|
30
30
|
get_from_env: Callable[[str, _T], _T] = ...,
|
|
31
31
|
) -> None: ...
|
|
@@ -46,5 +46,6 @@ class _AnnDataSettingsManager(SettingsManager):
|
|
|
46
46
|
use_sparse_array_on_read: bool = False
|
|
47
47
|
min_rows_for_chunked_h5_copy: int = 1000
|
|
48
48
|
disallow_forward_slash_in_h5ad: bool = False
|
|
49
|
+
auto_shard_zarr_v3: bool = False
|
|
49
50
|
|
|
50
51
|
settings: _AnnDataSettingsManager
|
|
@@ -111,9 +111,9 @@ class CategoricalArray(XBackendArray, Generic[K]):
|
|
|
111
111
|
def categories(self) -> np.ndarray:
|
|
112
112
|
if isinstance(self._categories, ZarrArray):
|
|
113
113
|
return self._categories[...]
|
|
114
|
-
from
|
|
114
|
+
from anndata.io import read_elem
|
|
115
115
|
|
|
116
|
-
return
|
|
116
|
+
return read_elem(self._categories)
|
|
117
117
|
|
|
118
118
|
def __getitem__(
|
|
119
119
|
self, key: xr.core.indexing.ExplicitIndexer
|
|
@@ -14,6 +14,7 @@ import h5py
|
|
|
14
14
|
import numpy as np
|
|
15
15
|
import pandas as pd
|
|
16
16
|
import pytest
|
|
17
|
+
import zarr
|
|
17
18
|
from pandas.api.types import is_numeric_dtype
|
|
18
19
|
from scipy import sparse
|
|
19
20
|
|
|
@@ -34,6 +35,7 @@ from anndata.compat import (
|
|
|
34
35
|
XDataArray,
|
|
35
36
|
XDataset,
|
|
36
37
|
ZarrArray,
|
|
38
|
+
ZarrGroup,
|
|
37
39
|
is_zarr_v2,
|
|
38
40
|
)
|
|
39
41
|
from anndata.utils import asarray
|
|
@@ -1187,3 +1189,23 @@ def get_multiindex_columns_df(shape: tuple[int, int]) -> pd.DataFrame:
|
|
|
1187
1189
|
+ list(itertools.product(["b"], range(shape[1] // 2)))
|
|
1188
1190
|
),
|
|
1189
1191
|
)
|
|
1192
|
+
|
|
1193
|
+
|
|
1194
|
+
def visititems_zarr(
|
|
1195
|
+
z: ZarrGroup, visitor: Callable[[str, ZarrGroup | zarr.Array], None]
|
|
1196
|
+
) -> None:
|
|
1197
|
+
for key in z:
|
|
1198
|
+
maybe_group = z[key]
|
|
1199
|
+
if isinstance(maybe_group, ZarrGroup):
|
|
1200
|
+
visititems_zarr(maybe_group, visitor)
|
|
1201
|
+
else:
|
|
1202
|
+
visitor(key, maybe_group)
|
|
1203
|
+
|
|
1204
|
+
|
|
1205
|
+
def check_all_sharded(g: ZarrGroup):
|
|
1206
|
+
def visit(key: str, arr: zarr.Array | zarr.Group):
|
|
1207
|
+
# Check for recarray via https://numpy.org/doc/stable/user/basics.rec.html#manipulating-and-displaying-structured-datatypes
|
|
1208
|
+
if isinstance(arr, zarr.Array) and arr.shape != () and arr.dtype.names is None:
|
|
1209
|
+
assert arr.shards is not None
|
|
1210
|
+
|
|
1211
|
+
visititems_zarr(g, visitor=visit)
|
|
@@ -8,12 +8,13 @@ import pandas as pd
|
|
|
8
8
|
import pytest
|
|
9
9
|
from scipy import sparse
|
|
10
10
|
|
|
11
|
-
from anndata import AnnData, concat
|
|
11
|
+
from anndata import AnnData, concat, settings
|
|
12
12
|
from anndata._core import merge
|
|
13
13
|
from anndata._core.merge import _resolve_axis
|
|
14
|
+
from anndata.compat import is_zarr_v2
|
|
14
15
|
from anndata.experimental.merge import as_group, concat_on_disk
|
|
15
16
|
from anndata.io import read_elem, write_elem
|
|
16
|
-
from anndata.tests.helpers import assert_equal, gen_adata
|
|
17
|
+
from anndata.tests.helpers import assert_equal, check_all_sharded, gen_adata
|
|
17
18
|
from anndata.utils import asarray
|
|
18
19
|
|
|
19
20
|
if TYPE_CHECKING:
|
|
@@ -230,7 +231,7 @@ def xxxm_adatas():
|
|
|
230
231
|
X=sparse.csr_matrix((2, 100)),
|
|
231
232
|
obs=pd.DataFrame(index=gen_index(2)),
|
|
232
233
|
obsm={
|
|
233
|
-
"sparse": np.arange(8).reshape(2, 4),
|
|
234
|
+
"sparse": sparse.csr_matrix(np.arange(8).reshape(2, 4)),
|
|
234
235
|
"dense": np.arange(4, 8).reshape(2, 2),
|
|
235
236
|
"df": pd.DataFrame(
|
|
236
237
|
{
|
|
@@ -253,6 +254,22 @@ def test_concatenate_xxxm(xxxm_adatas, tmp_path, file_format, join_type):
|
|
|
253
254
|
assert_eq_concat_on_disk(xxxm_adatas, tmp_path, file_format, join=join_type)
|
|
254
255
|
|
|
255
256
|
|
|
257
|
+
@pytest.mark.skipif(is_zarr_v2(), reason="auto sharding is allowed only for zarr v3.")
|
|
258
|
+
def test_concatenate_zarr_v3_shard(xxxm_adatas, tmp_path):
|
|
259
|
+
import zarr
|
|
260
|
+
|
|
261
|
+
with settings.override(auto_shard_zarr_v3=True, zarr_write_format=3):
|
|
262
|
+
assert_eq_concat_on_disk(xxxm_adatas, tmp_path, file_format="zarr")
|
|
263
|
+
g = zarr.open(tmp_path)
|
|
264
|
+
assert g.metadata.zarr_format == 3
|
|
265
|
+
|
|
266
|
+
def visit(key: str, arr: zarr.Array | zarr.Group):
|
|
267
|
+
if isinstance(arr, zarr.Array) and arr.shape != ():
|
|
268
|
+
assert arr.shards is not None
|
|
269
|
+
|
|
270
|
+
check_all_sharded(g)
|
|
271
|
+
|
|
272
|
+
|
|
256
273
|
def test_output_dir_exists(tmp_path):
|
|
257
274
|
in_pth = tmp_path / "in.h5ad"
|
|
258
275
|
out_pth = tmp_path / "does_not_exist" / "out.h5ad"
|
|
@@ -4,14 +4,12 @@ For tests using dask
|
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
6
6
|
|
|
7
|
-
from importlib.metadata import version
|
|
8
7
|
from pathlib import Path
|
|
9
8
|
from typing import TYPE_CHECKING
|
|
10
9
|
|
|
11
10
|
import numpy as np
|
|
12
11
|
import pandas as pd
|
|
13
12
|
import pytest
|
|
14
|
-
from packaging.version import Version
|
|
15
13
|
from scipy import sparse
|
|
16
14
|
|
|
17
15
|
import anndata as ad
|
|
@@ -25,6 +23,7 @@ from anndata.tests.helpers import (
|
|
|
25
23
|
as_dense_dask_array,
|
|
26
24
|
as_sparse_dask_array,
|
|
27
25
|
assert_equal,
|
|
26
|
+
check_all_sharded,
|
|
28
27
|
gen_adata,
|
|
29
28
|
)
|
|
30
29
|
|
|
@@ -111,12 +110,20 @@ def test_dask_write(adata, tmp_path, diskfmt):
|
|
|
111
110
|
|
|
112
111
|
@pytest.mark.xdist_group("dask")
|
|
113
112
|
@pytest.mark.dask_distributed
|
|
113
|
+
@pytest.mark.parametrize(
|
|
114
|
+
"auto_shard_zarr_v3",
|
|
115
|
+
[pytest.param(True, id="shard"), pytest.param(False, id="no-shard")],
|
|
116
|
+
)
|
|
114
117
|
def test_dask_distributed_write(
|
|
115
118
|
adata: AnnData,
|
|
116
119
|
tmp_path: Path,
|
|
117
120
|
diskfmt: Literal["h5ad", "zarr"],
|
|
118
121
|
local_cluster_addr: str,
|
|
122
|
+
*,
|
|
123
|
+
auto_shard_zarr_v3: bool,
|
|
119
124
|
) -> None:
|
|
125
|
+
if auto_shard_zarr_v3 and ad.settings.zarr_write_format == 2:
|
|
126
|
+
pytest.skip(reason="Cannot shard v2 data")
|
|
120
127
|
import dask.array as da
|
|
121
128
|
import dask.distributed as dd
|
|
122
129
|
import numpy as np
|
|
@@ -130,20 +137,12 @@ def test_dask_distributed_write(
|
|
|
130
137
|
adata.obsm["b"] = da.random.random((M, 10))
|
|
131
138
|
adata.varm["a"] = da.random.random((N, 10))
|
|
132
139
|
orig = adata
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
if is_corrupted_dask or is_h5:
|
|
136
|
-
with pytest.raises(
|
|
137
|
-
ValueError if is_h5 else RuntimeError,
|
|
138
|
-
match=r"Cannot write dask arrays to hdf5"
|
|
139
|
-
if is_h5
|
|
140
|
-
else r"Writing dense data with a distributed scheduler to zarr",
|
|
141
|
-
):
|
|
142
|
-
ad.io.write_elem(g, "", orig)
|
|
143
|
-
return
|
|
144
|
-
ad.io.write_elem(g, "", orig)
|
|
140
|
+
with ad.settings.override(auto_shard_zarr_v3=auto_shard_zarr_v3):
|
|
141
|
+
ad.io.write_elem(g, "", orig)
|
|
145
142
|
# TODO: See https://github.com/zarr-developers/zarr-python/issues/2716
|
|
146
143
|
g = as_group(pth, mode="r")
|
|
144
|
+
if auto_shard_zarr_v3:
|
|
145
|
+
check_all_sharded(g)
|
|
147
146
|
curr = ad.io.read_elem(g)
|
|
148
147
|
|
|
149
148
|
with pytest.raises(AssertionError):
|
|
@@ -72,7 +72,7 @@ def _alloc_cache():
|
|
|
72
72
|
# As of 2025.09.* dask, this needs a bit more than the previous 1.5mb.
|
|
73
73
|
# TODO: Why?
|
|
74
74
|
@pytest.mark.usefixtures("_alloc_cache")
|
|
75
|
-
@pytest.mark.limit_memory("
|
|
75
|
+
@pytest.mark.limit_memory("2.2 MB")
|
|
76
76
|
def test_size_of_view(mapping_name, give_chunks):
|
|
77
77
|
import dask.array as da
|
|
78
78
|
|
|
@@ -12,10 +12,14 @@ import anndata as ad
|
|
|
12
12
|
from anndata._io.zarr import open_write_group
|
|
13
13
|
from anndata.compat import CSArray, CSMatrix, ZarrGroup, is_zarr_v2
|
|
14
14
|
from anndata.experimental import read_dispatched, write_dispatched
|
|
15
|
-
from anndata.tests.helpers import
|
|
15
|
+
from anndata.tests.helpers import (
|
|
16
|
+
GEN_ADATA_NO_XARRAY_ARGS,
|
|
17
|
+
assert_equal,
|
|
18
|
+
gen_adata,
|
|
19
|
+
visititems_zarr,
|
|
20
|
+
)
|
|
16
21
|
|
|
17
22
|
if TYPE_CHECKING:
|
|
18
|
-
from collections.abc import Callable
|
|
19
23
|
from pathlib import Path
|
|
20
24
|
from typing import Literal
|
|
21
25
|
|
|
@@ -180,18 +184,7 @@ def test_write_dispatched_chunks(tmp_path: Path):
|
|
|
180
184
|
if is_zarr_v2():
|
|
181
185
|
z.visititems(check_chunking)
|
|
182
186
|
else:
|
|
183
|
-
|
|
184
|
-
def visititems(
|
|
185
|
-
z: ZarrGroup, visitor: Callable[[str, ZarrGroup | zarr.Array], None]
|
|
186
|
-
) -> None:
|
|
187
|
-
for key in z:
|
|
188
|
-
maybe_group = z[key]
|
|
189
|
-
if isinstance(maybe_group, ZarrGroup):
|
|
190
|
-
visititems(maybe_group, visitor)
|
|
191
|
-
else:
|
|
192
|
-
visitor(key, maybe_group)
|
|
193
|
-
|
|
194
|
-
visititems(z, check_chunking)
|
|
187
|
+
visititems_zarr(z, check_chunking)
|
|
195
188
|
|
|
196
189
|
|
|
197
190
|
@pytest.mark.zarr_io
|