anndata 0.12.3__tar.gz → 0.12.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {anndata-0.12.3 → anndata-0.12.4}/.github/workflows/test-cpu.yml +2 -2
- {anndata-0.12.3 → anndata-0.12.4}/.github/workflows/test-gpu.yml +1 -1
- {anndata-0.12.3 → anndata-0.12.4}/PKG-INFO +1 -1
- anndata-0.12.4/benchmarks/benchmarks/backed_hdf5.py +112 -0
- {anndata-0.12.3 → anndata-0.12.4}/benchmarks/benchmarks/dataset2d.py +21 -19
- {anndata-0.12.3 → anndata-0.12.4}/benchmarks/benchmarks/readwrite.py +12 -48
- {anndata-0.12.3 → anndata-0.12.4}/benchmarks/benchmarks/sparse_dataset.py +22 -15
- {anndata-0.12.3 → anndata-0.12.4}/benchmarks/benchmarks/utils.py +21 -3
- anndata-0.12.4/docs/release-notes/0.12.4.md +4 -0
- anndata-0.12.4/docs/release-notes/2172.bug.md +1 -0
- {anndata-0.12.3 → anndata-0.12.4}/hatch.toml +7 -3
- {anndata-0.12.3 → anndata-0.12.4}/pyproject.toml +5 -1
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/aligned_df.py +7 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/index.py +136 -23
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/merge.py +6 -5
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/sparse_dataset.py +4 -3
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_io/specs/methods.py +16 -25
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/experimental/backed/_lazy_arrays.py +5 -2
- {anndata-0.12.3 → anndata-0.12.4}/tests/conftest.py +7 -4
- {anndata-0.12.3 → anndata-0.12.4}/tests/lazy/test_concat.py +1 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_annot.py +24 -1
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_backed_hdf5.py +102 -9
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_dask.py +14 -2
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_inplace_subset.py +1 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_io_elementwise.py +4 -1
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_readwrite.py +6 -9
- {anndata-0.12.3 → anndata-0.12.4}/.cirun.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.codecov.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.editorconfig +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/ISSUE_TEMPLATE/bug-report.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/ISSUE_TEMPLATE/enhancement-request.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/ISSUE_TEMPLATE/question.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/dependabot.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/workflows/benchmark.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/workflows/check-pr.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/workflows/close-stale.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/workflows/codespell.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/workflows/label-stale.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.github/workflows/publish.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.gitignore +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.gitmodules +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.pre-commit-config.yaml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.prettierignore +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.prettierrc.yaml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.readthedocs.yml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.taplo.toml +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.vscode/launch.json +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/.vscode/settings.json +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/LICENSE +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/README.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/benchmarks/README.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/benchmarks/asv.conf.json +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/benchmarks/benchmarks/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/benchmarks/benchmarks/anndata.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/biome.jsonc +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/ci/constraints.txt +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/ci/scripts/min-deps.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/ci/scripts/towncrier_automation.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/Makefile +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/_key_contributors.rst +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/_static/img/anndata_schema.svg +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/_templates/autosummary/class.rst +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/api.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/benchmark-read-write.ipynb +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/benchmarks.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/concatenation.rst +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/conf.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/contributing.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/extensions/autosummary_skip_inherited.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/extensions/no_skip_abc_members.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/extensions/patch_myst_cite.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/fileformat-prose.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/index.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/interoperability.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/news.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/references.rst +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.10.0.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.10.1.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.10.2.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.10.3.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.10.4.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.10.5.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.10.6.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.10.7.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.10.8.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.10.9.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.11.0.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.11.1.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.11.2.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.11.3.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.11.4.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.12.0.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.12.1.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.12.2.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.12.3.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.4.0.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.5.0.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.6.0.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.6.x.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.7.0.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.7.2.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.7.3.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.7.4.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.7.5.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.7.6.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.7.7.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.7.8.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.8.0.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.9.0.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.9.1.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/0.9.2.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/release-notes/index.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/tutorials/index.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/docs/tutorials/zarr-v3.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/access.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/aligned_mapping.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/anndata.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/extensions.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/file_backing.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/raw.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/storage.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/views.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_core/xarray.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_io/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_io/h5ad.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_io/read.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_io/specs/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_io/specs/lazy_methods.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_io/specs/registry.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_io/utils.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_io/write.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_io/zarr.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_settings.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_settings.pyi +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_types.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/_warnings.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/abc.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/compat/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/experimental/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/experimental/_dispatch_io.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/experimental/backed/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/experimental/backed/_compat.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/experimental/backed/_io.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/experimental/merge.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/experimental/multi_files/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/experimental/multi_files/_anncollection.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/experimental/pytorch/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/experimental/pytorch/_annloader.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/io.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/logging.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/tests/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/tests/helpers.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/types.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/typing.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/anndata/utils.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/testing/anndata/__init__.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/testing/anndata/_doctest.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/testing/anndata/_pytest.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/src/testing/anndata/py.typed +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/adata-comments.tsv +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/adata.csv +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/archives/readme.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/archives/v0.11.4/adata.h5ad +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/archives/v0.11.4/adata.zarr.zip +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/archives/v0.11.4/readme.md +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/archives/v0.7.0/adata.h5ad +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/archives/v0.7.0/adata.zarr.zip +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/archives/v0.7.8/adata.h5ad +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/archives/v0.7.8/adata.zarr.zip +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/excel.xlsx +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/data/umi_tools.tsv.gz +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/lazy/conftest.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/lazy/test_read.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/lazy/test_write.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_anncollection.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_awkward.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_backed_dense.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_backed_sparse.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_base.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_concatenate.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_concatenate_disk.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_dask_view_mem.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_deprecations.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_extensions.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_get_vector.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_gpu.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_helpers.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_io_backwards_compat.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_io_conversion.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_io_dispatched.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_io_partial.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_io_utils.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_io_warnings.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_layers.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_obsmvarm.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_obspvarp.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_raw.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_repr.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_settings.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_structured_arrays.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_transpose.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_uns.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_utils.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_views.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_x.py +0 -0
- {anndata-0.12.3 → anndata-0.12.4}/tests/test_xarray.py +0 -0

.github/workflows/test-cpu.yml

```diff
@@ -43,7 +43,7 @@ jobs:
     strategy:
       matrix:
         env: ${{ fromJSON(needs.get-environments.outputs.envs) }}
-        io_mark: ["zarr_io", "not zarr_io"]
+        io_mark: ["zarr_io", "not zarr_io", "dask_distributed"] # dask_distributed should not be run with -n auto as it uses a client with processes
     env: # environment variables for use in codecov’s env_vars tagging
       ENV_NAME: ${{ matrix.env.name }}
       IO_MARK: ${{ matrix.io_mark }}
@@ -72,7 +72,7 @@ jobs:
        env:
          COVERAGE_PROCESS_START: ${{ github.workspace }}/pyproject.toml
        run: |
-          hatch run ${{ matrix.env.name }}:run-cov -v --color=yes -n auto --junitxml=test-data/test-results.xml -m "${{ matrix.io_mark }}" ${{ matrix.env.args }}
+          hatch run ${{ matrix.env.name }}:run-cov -v --color=yes ${{ matrix.io_mark != 'dask_distributed' && '-n auto' || '' }} --junitxml=test-data/test-results.xml -m "${{ matrix.io_mark }}" ${{ matrix.env.args }}
          hatch run ${{ matrix.env.name }}:cov-combine
          hatch run ${{ matrix.env.name }}:coverage xml

```

.github/workflows/test-gpu.yml

```diff
@@ -63,7 +63,7 @@ jobs:
          echo "max_python_version=$max_version" >> $GITHUB_ENV

      - name: Install UV
-        uses: astral-sh/setup-uv@v6
+        uses: astral-sh/setup-uv@v6 # TODO: upgrade once cirun image supports node 24
        with:
          enable-cache: true
          python-version: ${{ env.max_python_version }}
```

benchmarks/benchmarks/backed_hdf5.py (new file)

```diff
@@ -0,0 +1,112 @@
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+from scipy import sparse
+
+import anndata as ad
+
+file_paths = {"sparse": "adata_sparse.h5ad"}
+
+
+class BackedHDF5Indexing:
+    param_names = ("arr_type",)
+    params = ("sparse",)
+
+    def setup_cache(self):
+        X_sparse = sparse.random(
+            10000,
+            50000,
+            density=0.01,
+            format="csr",
+            random_state=np.random.default_rng(42),
+        )
+        for X, arr_type in [
+            (X_sparse, "sparse"),
+        ]:
+            n_obs, n_var = X.shape
+
+            # Create obs and var dataframes
+            obs = pd.DataFrame(
+                {
+                    "cell_type": pd.Categorical(
+                        np.random.choice(["TypeA", "TypeB", "TypeC"], n_obs)
+                    ),
+                    "total_counts": np.random.randint(1000, 5000, n_obs),
+                },
+                index=[f"cell_{i}" for i in range(n_obs)],
+            )
+
+            var = pd.DataFrame(
+                {
+                    "gene_name": [f"gene_{i}" for i in range(n_var)],
+                },
+                index=[f"ENSG_{i:08d}" for i in range(n_var)],
+            )
+
+            # Create AnnData object and save to HDF5
+            adata = ad.AnnData(X=X, obs=obs, var=var)
+
+            # Create temporary file
+            adata.write_h5ad(file_paths[arr_type])
+
+    def setup(self, arr_type):
+        # Open as backed
+        self.adata_backed = ad.read_h5ad(file_paths[arr_type], backed="r")
+        self.n_obs, self.n_var = self.adata_backed.shape
+        # Prepare indices for duplicate index testing
+        self.obs_idx_with_dupes = np.array([0, 1, 0, 2, 1] * (self.n_obs // 100 + 1))[
+            : (self.n_obs // 10)
+        ]
+        self.var_idx_with_dupes = np.array([0, 1, 2, 0, 3] * (self.n_var // 100 + 1))[
+            : (self.n_var // 10)
+        ]
+        self.obs_idx_no_dupes = np.arange(0, self.n_obs, 10)
+        self.var_idx_no_dupes = np.arange(0, self.n_var, 10)
+
+    def time_slice_obs(self, *_):
+        """Time slicing observations from backed HDF5"""
+        self.adata_backed[0 : (self.n_obs // 2), :]
+
+    def time_slice_obs_to_memory(self, *_):
+        """Time slicing observations from backed HDF5"""
+        self.adata_backed[0 : (self.n_obs // 2), :].to_memory()
+
+    def peakmem_slice_obs(self, *_):
+        """Peak memory for slicing observations from backed HDF5"""
+        self.adata_backed[0 : (self.n_obs // 2), :]
+
+    def time_fancy_index_no_dupes(self, *_):
+        """Time fancy indexing without duplicates"""
+        self.adata_backed[self.obs_idx_no_dupes, self.var_idx_no_dupes]
+
+    def peakmem_fancy_index_no_dupes(self, *_):
+        """Peak memory for fancy indexing without duplicates"""
+        self.adata_backed[self.obs_idx_no_dupes, self.var_idx_no_dupes]
+
+    def time_fancy_index_no_dupes_to_memory(self, *_):
+        """Time fancy indexing without duplicates"""
+        self.adata_backed[self.obs_idx_no_dupes, self.var_idx_no_dupes].to_memory()
+
+    def time_index_with_dupes_obs(self, *_):
+        """Time fancy indexing with duplicate observation indices"""
+        self.adata_backed[self.obs_idx_with_dupes, :]
+
+    def peakmem_index_with_dupes_obs(self, *_):
+        """Peak memory for fancy indexing with duplicate observation indices"""
+        self.adata_backed[self.obs_idx_with_dupes, :]
+
+    def time_to_memory_subset(self, *_):
+        """Time converting subset to memory"""
+        subset = self.adata_backed[0 : (self.n_obs // 4), 0 : (self.n_var // 4)]
+        subset.to_memory()
+
+    def peakmem_to_memory_subset(self, *_):
+        """Peak memory for converting subset to memory"""
+        subset = self.adata_backed[0 : (self.n_obs // 4), 0 : (self.n_var // 4)]
+        subset.to_memory()
+
+    def teardown(self, *_):
+        """Clean up temporary files"""
+        if hasattr(self, "adata_backed"):
+            self.adata_backed.file.close()
```
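A note on the asv conventions relied on here: `setup_cache` runs once per environment and its output files are shared across parameter combinations, while `time_*`/`peakmem_*` methods are discovered automatically. The duplicate-index cases are the interesting ones, since raw HDF5 fancy indexing requires sorted, unique indices; a minimal sketch (reusing the file name from the benchmark and assuming it has already been created) of what they exercise:

```python
import numpy as np

import anndata as ad

# Assumes setup_cache above has written this file to the working directory.
adata = ad.read_h5ad("adata_sparse.h5ad", backed="r")

idx = np.array([0, 1, 0, 2, 1])  # unsorted *and* duplicated row positions
view = adata[idx, :]             # anndata reorders/deduplicates behind the scenes
print(view.to_memory().shape)    # (5, 50000)
```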

benchmarks/benchmarks/dataset2d.py

```diff
@@ -1,7 +1,5 @@
 from __future__ import annotations

-import tempfile
-from pathlib import Path
 from typing import TYPE_CHECKING

 import h5py
@@ -12,35 +10,39 @@ import zarr
 import anndata as ad

 if TYPE_CHECKING:
-    from
+    from typing import Literal


 class Dataset2D:
-    param_names = ("
+    param_names = ("store_type", "chunks")
     params = (
-        (
-            lambda: h5py.File(Path(tempfile.mkdtemp()) / "data.h5ad", mode="w"),
-            lambda: zarr.open(
-                Path(tempfile.mkdtemp()) / "data.zarr", mode="w", zarr_version=2
-            ),
-        ),
+        ("zarr", "h5ad"),
         ((-1,), None),
     )

-    def
-
-    ):
-        self.n_obs = 100000
+    def setup_cache(self):
+        n_obs = 100000
         df = pd.DataFrame(
             {
-                "a": pd.Categorical(np.array(["a"] *
-                "b": np.arange(
+                "a": pd.Categorical(np.array(["a"] * n_obs)),
+                "b": np.arange(n_obs),
             },
-            index=[f"cell{i}" for i in range(
+            index=[f"cell{i}" for i in range(n_obs)],
+        )
+        for store in [
+            h5py.File("data.h5ad", mode="w"),
+            zarr.open("data.zarr", mode="w", zarr_version=2),
+        ]:
+            ad.io.write_elem(store, "obs", df)
+
+    def setup(self, store_type: Literal["zarr", "h5ad"], chunks: None | tuple[int]):
+        store = (
+            h5py.File("data.h5ad", mode="r")
+            if store_type == "h5ad"
+            else zarr.open("data.zarr")
         )
-        store = gen_store()
-        ad.io.write_elem(store, "obs", df)
         self.ds = ad.experimental.read_elem_lazy(store["obs"], chunks=chunks)
+        self.n_obs = self.ds.shape[0]

     def time_getitem_slice(self, *_):
         self.ds.iloc[0 : (self.n_obs // 2)].to_memory()
```
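The refactor replaces per-instance store factories (lambdas over `tempfile`) with `setup_cache`, which asv runs once in a dedicated benchmark directory that is also the working directory for `setup`, so relative paths like `data.zarr` resolve to the cached copies. A hedged sketch of the pattern (class and file names hypothetical, assuming asv's documented `setup_cache` semantics):

```python
class Example:
    """Sketch of the asv setup_cache pattern used above."""

    params = (("small", "large"),)
    param_names = ("size",)

    def setup_cache(self):
        # Runs once per benchmark environment; files land in a cached
        # working directory that every setup()/time_*() call also sees.
        for size, n in [("small", 10), ("large", 1000)]:
            with open(f"data_{size}.txt", "w") as f:
                f.write("x" * n)

    def setup(self, size):
        # Re-open by relative path instead of regenerating per parameter set.
        with open(f"data_{size}.txt") as f:
            self.data = f.read()

    def time_len(self, size):
        len(self.data)
```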

benchmarks/benchmarks/readwrite.py

```diff
@@ -38,52 +38,15 @@ from .utils import get_actualsize, get_peak_mem, sedate

 PBMC_3K_URL = "https://falexwolf.de/data/pbmc3k_raw.h5ad"

-# PBMC_3K_PATH = Path(__file__).parent / "data/pbmc3k_raw.h5ad"
-# PBMC_REDUCED_PATH = Path(__file__).parent / "10x_pbmc68k_reduced.h5ad"
-# BM_43K_CSR_PATH = Path(__file__).parent.parent / "datasets/BM2_43k-cells.h5ad"
-# BM_43K_CSC_PATH = Path(__file__).parent.parent / "datasets/BM2_43k-cells_CSC.h5ad"
-
-
-# class ZarrReadSuite:
-#     params = []
-#     param_names = ["input_url"]
-
-#     def setup(self, input_url):
-#         self.filepath = pooch.retrieve(url=input_url, known_hash=None)
-
-#     def time_read_full(self, input_url):
-#         anndata.read_zarr(self.filepath)
-
-#     def peakmem_read_full(self, input_url):
-#         anndata.read_zarr(self.filepath)
-
-#     def mem_readfull_object(self, input_url):
-#         return anndata.read_zarr(self.filepath)
-
-#     def track_read_full_memratio(self, input_url):
-#         mem_recording = memory_usage(
-#             (sedate(anndata.read_zarr, 0.005), (self.filepath,)), interval=0.001
-#         )
-#         adata = anndata.read_zarr(self.filepath)
-#         base_size = mem_recording[-1] - mem_recording[0]
-#         print(np.max(mem_recording) - np.min(mem_recording))
-#         print(base_size)
-#         return (np.max(mem_recording) - np.min(mem_recording)) / base_size
-
-#     def peakmem_read_backed(self, input_url):
-#         anndata.read_zarr(self.filepath, backed="r")
-
-#     def mem_read_backed_object(self, input_url):
-#         return anndata.read_zarr(self.filepath, backed="r")
-

 class H5ADInMemorySizeSuite:
-
-    params = _urls.keys()
-    param_names = ("input_data",)
+    filepath = "pbmc_in_mem.h5ad"

-    def
-
+    def setup_cache(self):
+        # Need to specify path because the working directory is special for asv
+        pooch.retrieve(
+            url=PBMC_3K_URL, known_hash=None, path=Path.cwd(), fname=self.filepath
+        )

     def track_in_memory_size(self, *_):
         adata = anndata.read_h5ad(self.filepath)
@@ -99,12 +62,13 @@ class H5ADInMemorySizeSuite:


 class H5ADReadSuite:
-
-    params = _urls.keys()
-    param_names = ("input_data",)
+    filepath = "pbmc_read.h5ad"

-    def
-
+    def setup_cache(self):
+        # Need to specify path because the working directory is special for asv
+        pooch.retrieve(
+            url=PBMC_3K_URL, known_hash=None, path=Path.cwd(), fname=self.filepath
+        )

     def time_read_full(self, *_):
         anndata.read_h5ad(self.filepath)
```
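`pooch.retrieve` normally caches downloads under a per-user OS cache directory; passing `path` and `fname` pins the file into asv's benchmark working directory instead, which is what the "working directory is special for asv" comment refers to. A small illustration:

```python
from pathlib import Path

import pooch

# Download (or reuse) the PBMC file next to the benchmark's working directory.
# known_hash=None skips checksum verification, matching the diff above.
local_path = pooch.retrieve(
    url="https://falexwolf.de/data/pbmc3k_raw.h5ad",
    known_hash=None,
    path=Path.cwd(),          # override pooch's default cache location
    fname="pbmc_read.h5ad",   # fixed name so setup() can re-open it by path
)
print(local_path)
```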

benchmarks/benchmarks/sparse_dataset.py

```diff
@@ -21,7 +21,7 @@ def make_alternating_mask(n):


 class SparseCSRContiguousSlice:
-    _slices = MappingProxyType({
+    _indexers = MappingProxyType({
         "0:1000": slice(0, 1000),
         "0:9000": slice(0, 9000),
         ":9000:-1": slice(None, 9000, -1),
@@ -31,42 +31,49 @@ class SparseCSRContiguousSlice:
         "first": 0,
         "alternating": make_alternating_mask(10),
     })
+    filepath = "data.zarr"
     params = (
-        …
-        (10_000, 10_000),
-        # (10_000, 500)
-        ],
-        _slices.keys(),
+        list(_indexers.keys()),
         [True, False],
     )
-    param_names = (
+    param_names = (
+        "index",
+        "use_dask",
+    )

-    def
+    def setup_cache(self):
         X = sparse.random(
-            …
+            10_000,
+            10_000,
+            density=0.01,
+            format="csr",
+            random_state=np.random.default_rng(42),
         )
-        …
-        g = zarr.group()
+        g = zarr.group(self.filepath)
         write_elem(g, "X", X)
+
+    def setup(self, index: str, use_dask: bool):  # noqa: FBT001
+        g = zarr.open(self.filepath)
         self.x = read_elem_lazy(g["X"]) if use_dask else sparse_dataset(g["X"])
         self.adata = AnnData(self.x)
+        self.index = self._indexers[index]

     def time_getitem(self, *_):
-        res = self.x[self.
+        res = self.x[self.index]
         if isinstance(res, DaskArray):
             res.compute()

     def peakmem_getitem(self, *_):
-        res = self.x[self.
+        res = self.x[self.index]
         if isinstance(res, DaskArray):
             res.compute()

     def time_getitem_adata(self, *_):
-        res = self.adata[self.
+        res = self.adata[self.index]
         if isinstance(res, DaskArray):
             res.compute()

     def peakmem_getitem_adata(self, *_):
-        res = self.adata[self.
+        res = self.adata[self.index]
         if isinstance(res, DaskArray):
             res.compute()
```
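`MappingProxyType` gives a read-only view of the indexer table, so the class attribute cannot be mutated by accident while asv enumerates `params`; `list(_indexers.keys())` materializes the keys because asv expects a concrete sequence. A quick illustration:

```python
from types import MappingProxyType

_indexers = MappingProxyType({"0:1000": slice(0, 1000), "first": 0})

print(list(_indexers.keys()))  # ['0:1000', 'first'] -- a concrete list for asv params
try:
    _indexers["new"] = slice(None)
except TypeError as e:
    print(e)  # 'mappingproxy' object does not support item assignment
```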

benchmarks/benchmarks/utils.py

```diff
@@ -95,13 +95,31 @@ def gen_indexer(adata, dim, index_kind, ratio):

 def gen_adata(n_obs, n_var, attr_set):
     if "X-csr" in attr_set:
-        X = sparse.random(
+        X = sparse.random(
+            n_obs,
+            n_var,
+            density=0.1,
+            format="csr",
+            random_state=np.random.default_rng(42),
+        )
     elif "X-dense" in attr_set:
-        X = sparse.random(
+        X = sparse.random(
+            n_obs,
+            n_var,
+            density=0.1,
+            format="csr",
+            random_state=np.random.default_rng(42),
+        )
         X = X.toarray()
     else:
         # TODO: There's probably a better way to do this
-        X = sparse.random(
+        X = sparse.random(
+            n_obs,
+            n_var,
+            density=0,
+            format="csr",
+            random_state=np.random.default_rng(42),
+        )
     adata = AnnData(X)
     if "obs,var" in attr_set:
         adata.obs = pd.DataFrame(
```
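Passing a seeded `Generator` as `random_state` makes `sparse.random` deterministic, so every benchmark run (and machine) sees the same matrix. A quick check:

```python
import numpy as np
from scipy import sparse

X = sparse.random(100, 50, density=0.1, format="csr", random_state=np.random.default_rng(42))
Y = sparse.random(100, 50, density=0.1, format="csr", random_state=np.random.default_rng(42))

assert (X - Y).nnz == 0  # identical sparsity pattern and values
```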

docs/release-notes/2172.bug.md (new file)

```diff
@@ -0,0 +1 @@
+{func}`dask.array.store` was producing corrupted data with zarr v3 + distributed scheduler + a lock (which we used internally): see {ref}`dask/dask#12109`. Thus dense arrays were potentially being stored with corrupted data. The solution is to remove the lock for newer versions of dask but without the lock in older versions, it is impossible to store the data. Thus versions of dask older than `2025.4.0` will not be supported for writing dense data. {user}`ilan-gold`
```
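For orientation, a hedged sketch of the lock-free write path the fix moves to (file names hypothetical; this is not the exact anndata internals):

```python
import dask.array as da
import zarr

# Write a dense dask array into a zarr v3 store.
x = da.ones((1_000, 100), chunks=(100, 100))
g = zarr.open_group("example.zarr", mode="w")
z = g.create_array("X", shape=x.shape, chunks=(100, 100), dtype=x.dtype)

# With dask >= 2025.4.0 the store call can run without a lock, which avoids
# the corruption seen with zarr v3 + the distributed scheduler + a lock.
da.store(x, z, lock=False)
```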

hatch.toml

```diff
@@ -24,9 +24,13 @@ overrides.matrix.deps.env-vars = [
   { if = [ "min" ], key = "UV_CONSTRAINT", value = "ci/constraints.txt ci/min-deps.txt" },
 ]
 overrides.matrix.deps.pre-install-commands = [
-  { if = [
-  …
-  …
+  { if = [
+    "min",
+  ], value = "uv run ci/scripts/min-deps.py pyproject.toml --all-extras -o ci/min-deps.txt" },
+  # To prevent situations like https://github.com/pydata/xarray/issues/10419 going forward, and test against zarr as well
+  { if = [
+    "pre",
+  ], value = "echo 'xarray @ git+https://github.com/pydata/xarray.git\nzarr @ git+https://github.com/zarr-developers/zarr-python.git' > ci/pre-deps.txt" },

 ]
 overrides.matrix.deps.python = [
```

pyproject.toml

```diff
@@ -174,7 +174,11 @@ testpaths = [
 ]
 # For some reason this effects how logging is shown when tests are run
 xfail_strict = true
-markers = [
+markers = [
+    "gpu: mark test to run on GPU",
+    "zarr_io: mark tests that involve zarr io",
+    "dask_distributed: tests that need a distributed client with multiple processes",
+]

 [tool.ruff]
 src = [ "src" ]
```
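Registering markers here keeps pytest from warning about (or, under `--strict-markers`, rejecting) the new `dask_distributed` mark. A hedged sketch of how such a marked test might look (test name and body are illustrative, not from the diff):

```python
import pytest


@pytest.mark.dask_distributed
def test_write_with_distributed_client():
    distributed = pytest.importorskip("distributed")
    # A multi-process client is why these tests are excluded from -n auto in CI.
    with distributed.Client(processes=True):
        ...  # exercise zarr/dense writing under the distributed scheduler
```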

src/anndata/_core/aligned_df.py

```diff
@@ -78,6 +78,13 @@ def _gen_dataframe_df(
     attr: Literal["obs", "var"],
     length: int | None = None,
 ):
+    if isinstance(anno.index, pd.MultiIndex):
+        msg = (
+            "pandas.MultiIndex not supported as index for obs or var on declaration.\n\
+            You can set `obs_names` manually although most operations after will error or convert to str.\n\
+            This behavior will likely be clarified in a future breaking release."
+        )
+        raise ValueError(msg)
     if length is not None and length != len(anno):
         raise _mk_df_error(source, attr, length, len(anno))
     anno = anno.copy(deep=False)
```
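The effect of the new guard, in a minimal (hypothetical) reproduction:

```python
import numpy as np
import pandas as pd

import anndata as ad

idx = pd.MultiIndex.from_tuples([("sample1", "a"), ("sample1", "b")])
obs = pd.DataFrame(index=idx)

try:
    ad.AnnData(X=np.zeros((2, 3), dtype=np.float32), obs=obs)
except ValueError as e:
    print(e)  # pandas.MultiIndex not supported as index for obs or var ...
```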

src/anndata/_core/index.py

```diff
@@ -3,7 +3,7 @@ from __future__ import annotations
 from collections.abc import Iterable, Sequence
 from functools import singledispatch
 from itertools import repeat
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, cast, overload

 import h5py
 import numpy as np
@@ -14,6 +14,8 @@ from ..compat import AwkArray, CSArray, CSMatrix, DaskArray, XDataArray
 from .xarray import Dataset2D

 if TYPE_CHECKING:
+    from numpy.typing import NDArray
+
     from ..compat import Index, Index1D, Index1DNorm
```

src/anndata/_core/index.py (continued)

```diff
@@ -161,7 +163,10 @@ def unpack_index(index: Index) -> tuple[Index1D, Index1D]:


 @singledispatch
-def _subset(a: np.ndarray | pd.DataFrame, subset_idx: Index):
+def _subset(
+    a: np.ndarray | pd.DataFrame,
+    subset_idx: tuple[Index1DNorm] | tuple[Index1DNorm, Index1DNorm],
+):
     # Select as combination of indexes, not coordinates
     # Correcting for indexing behaviour of np.ndarray
     if all(isinstance(x, Iterable) for x in subset_idx):
@@ -170,7 +175,9 @@ def _subset(a: np.ndarray | pd.DataFrame, subset_idx: Index):


 @_subset.register(DaskArray)
-def _subset_dask(a: DaskArray, subset_idx: Index):
+def _subset_dask(
+    a: DaskArray, subset_idx: tuple[Index1DNorm] | tuple[Index1DNorm, Index1DNorm]
+):
     if len(subset_idx) > 1 and all(isinstance(x, Iterable) for x in subset_idx):
         if issparse(a._meta) and a._meta.format == "csc":
             return a[:, subset_idx[1]][subset_idx[0], :]
@@ -180,24 +187,32 @@ def _subset_dask(a: DaskArray, subset_idx: Index):

 @_subset.register(CSMatrix)
 @_subset.register(CSArray)
-def _subset_sparse(
+def _subset_sparse(
+    a: CSMatrix | CSArray,
+    subset_idx: tuple[Index1DNorm] | tuple[Index1DNorm, Index1DNorm],
+):
     # Correcting for indexing behaviour of sparse.spmatrix
     if len(subset_idx) > 1 and all(isinstance(x, Iterable) for x in subset_idx):
         first_idx = subset_idx[0]
         if issubclass(first_idx.dtype.type, np.bool_):
-            first_idx = np.
+            first_idx = np.flatnonzero(first_idx)
         subset_idx = (first_idx.reshape(-1, 1), *subset_idx[1:])
     return a[subset_idx]


 @_subset.register(pd.DataFrame)
 @_subset.register(Dataset2D)
-def _subset_df(
+def _subset_df(
+    df: pd.DataFrame | Dataset2D,
+    subset_idx: tuple[Index1DNorm] | tuple[Index1DNorm, Index1DNorm],
+):
     return df.iloc[subset_idx]


 @_subset.register(AwkArray)
-def _subset_awkarray(a: AwkArray, subset_idx: Index):
+def _subset_awkarray(
+    a: AwkArray, subset_idx: tuple[Index1DNorm] | tuple[Index1DNorm, Index1DNorm]
+):
     if all(isinstance(x, Iterable) for x in subset_idx):
         subset_idx = np.ix_(*subset_idx)
     return a[subset_idx]
```
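`np.flatnonzero(mask)` is equivalent to `np.where(mask)[0]`: it turns a boolean mask into the integer positions that scipy's sparse containers need before the `reshape(-1, 1)` outer-indexing trick above. A small illustration:

```python
import numpy as np
from scipy import sparse

a = sparse.random(5, 4, density=0.5, format="csr", random_state=np.random.default_rng(0))
mask = np.array([True, False, True, False, True])

rows = np.flatnonzero(mask)   # array([0, 2, 4]); same as np.where(mask)[0]
cols = np.array([1, 3])
# Outer selection: a column vector of rows against a row of columns.
sub = a[rows.reshape(-1, 1), cols]
print(sub.shape)              # (3, 2)
```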

src/anndata/_core/index.py (continued)

```diff
@@ -205,23 +220,121 @@ def _subset_awkarray(a: AwkArray, subset_idx: Index):

 # Registration for SparseDataset occurs in sparse_dataset.py
 @_subset.register(h5py.Dataset)
-def _subset_dataset(
-    …
+def _subset_dataset(
+    d: h5py.Dataset, subset_idx: tuple[Index1DNorm] | tuple[Index1DNorm, Index1DNorm]
+):
+    order: tuple[NDArray[np.integer] | slice, ...]
+    inv_order: tuple[NDArray[np.integer] | slice, ...]
+    order, inv_order = zip(*map(_index_order_and_inverse, subset_idx), strict=True)
+    # check for duplicates or multi-dimensional fancy indexing
+    array_dims = [i for i in order if isinstance(i, np.ndarray)]
+    has_duplicates = any(len(np.unique(i)) != len(i) for i in array_dims)
+    # Use safe indexing if there are duplicates OR multiple array dimensions
+    # (h5py doesn't support multi-dimensional fancy indexing natively)
+    if has_duplicates or len(array_dims) > 1:
+        # For multi-dimensional indexing, bypass the sorting logic and use original indices
+        return _safe_fancy_index_h5py(d, subset_idx)
     # from hdf5, then to real order
-    return d[
-    …
+    return d[order][inv_order]
+
+
+@overload
+def _index_order_and_inverse(
+    axis_idx: NDArray[np.integer] | NDArray[np.bool_],
+) -> tuple[NDArray[np.integer], NDArray[np.integer]]: ...
+@overload
+def _index_order_and_inverse(axis_idx: slice) -> tuple[slice, slice]: ...
+def _index_order_and_inverse(
+    axis_idx: Index1DNorm,
+) -> tuple[Index1DNorm, NDArray[np.integer] | slice]:
+    """Order and get inverse index array."""
+    if not isinstance(axis_idx, np.ndarray):
+        return axis_idx, slice(None)
+    if axis_idx.dtype == bool:
+        axis_idx = np.flatnonzero(axis_idx)
+    order = np.argsort(axis_idx)
+    return axis_idx[order], np.argsort(order)
+
+
+@overload
+def _process_index_for_h5py(
+    idx: NDArray[np.integer] | NDArray[np.bool_],
+) -> tuple[NDArray[np.integer], NDArray[np.integer]]: ...
+@overload
+def _process_index_for_h5py(idx: slice) -> tuple[slice, None]: ...
+def _process_index_for_h5py(
+    idx: Index1DNorm,
+) -> tuple[Index1DNorm, NDArray[np.integer] | None]:
+    """Process a single index for h5py compatibility, handling sorting and duplicates."""
+    if not isinstance(idx, np.ndarray):
+        # Not an array (slice, integer, list) - no special processing needed
+        return idx, None
+
+    if idx.dtype == bool:
+        idx = np.flatnonzero(idx)
+
+    # For h5py fancy indexing, we need sorted indices
+    # But we also need to track how to reverse the sorting
+    unique, inverse = np.unique(idx, return_inverse=True)
+    return (
+        # Has duplicates - use unique + inverse mapping approach
+        (unique, inverse)
+        if len(unique) != len(idx)
+        # No duplicates - just sort and track reverse mapping
+        else _index_order_and_inverse(idx)
+    )
+
+
+def _safe_fancy_index_h5py(
+    dataset: h5py.Dataset,
+    subset_idx: tuple[Index1DNorm] | tuple[Index1DNorm, Index1DNorm],
+) -> h5py.Dataset:
+    # Handle multi-dimensional indexing of h5py dataset
+    # This avoids h5py's limitation with multi-dimensional fancy indexing
+    # without loading the entire dataset into memory
+
+    # Convert boolean arrays to integer arrays and handle sorting for h5py
+    processed_indices: tuple[NDArray[np.integer] | slice, ...]
+    reverse_indices: tuple[NDArray[np.integer] | None, ...]
+    processed_indices, reverse_indices = zip(
+        *map(_process_index_for_h5py, subset_idx), strict=True
+    )
+
+    # First find the index that reduces the size of the dataset the most
+    i_min = np.argmin([
+        _get_index_size(inds, dataset.shape[i]) / dataset.shape[i]
+        for i, inds in enumerate(processed_indices)
+    ])
+
+    # Apply the most selective index first to h5py dataset
+    first_index = [slice(None)] * len(processed_indices)
+    first_index[i_min] = processed_indices[i_min]
+    in_memory_array = cast("np.ndarray", dataset[tuple(first_index)])
+
+    # Apply remaining indices to the numpy array
+    remaining_indices = list(processed_indices)
+    remaining_indices[i_min] = slice(None)  # Already applied
+    result = in_memory_array[tuple(remaining_indices)]
+
+    # Now apply reverse mappings to get the original order
+    for dim, reverse_map in enumerate(reverse_indices):
+        if reverse_map is not None:
+            result = result.take(reverse_map, axis=dim)
+
+    return result
+
+
+def _get_index_size(idx: Index1DNorm, dim_size: int) -> int:
+    """Get size for any index type."""
+    if isinstance(idx, slice):
+        return len(range(*idx.indices(dim_size)))
+    elif isinstance(idx, int):
+        return 1
+    else:  # For other types, try to get length
+        return len(idx)
+
+
+def make_slice(idx, dimidx: int, n: int = 2) -> tuple[slice, ...]:
     mut = list(repeat(slice(None), n))
     mut[dimidx] = idx
     return tuple(mut)
```