cellarr-array 0.0.3__tar.gz → 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cellarr_array-0.1.0/.github/workflows/run-tests.yml +73 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/CHANGELOG.md +6 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/LICENSE.txt +1 -1
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/PKG-INFO +1 -1
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/src/cellarr_array/__init__.py +2 -2
- cellarr_array-0.1.0/src/cellarr_array/cellarray_base.py +344 -0
- cellarr_array-0.0.3/src/cellarr_array/DenseCellArray.py → cellarr_array-0.1.0/src/cellarr_array/cellarray_dense.py +1 -1
- cellarr_array-0.0.3/src/cellarr_array/SparseCellArray.py → cellarr_array-0.1.0/src/cellarr_array/cellarray_sparse.py +66 -13
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/src/cellarr_array/helpers.py +7 -3
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/src/cellarr_array.egg-info/PKG-INFO +1 -1
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/src/cellarr_array.egg-info/SOURCES.txt +3 -3
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/tests/test_all.py +1 -1
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/tests/test_dense.py +16 -1
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/tests/test_inmemory.py +11 -2
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/tests/test_sparse.py +21 -6
- cellarr_array-0.0.3/.github/workflows/run-tests.yml +0 -33
- cellarr_array-0.0.3/src/cellarr_array/CellArray.py +0 -251
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/.coveragerc +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/.github/workflows/publish-pypi.yml +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/.gitignore +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/.pre-commit-config.yaml +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/.readthedocs.yml +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/AUTHORS.md +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/CONTRIBUTING.md +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/README.md +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/docs/Makefile +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/docs/_static/.gitignore +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/docs/authors.md +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/docs/changelog.md +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/docs/conf.py +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/docs/contributing.md +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/docs/index.md +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/docs/license.md +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/docs/readme.md +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/docs/requirements.txt +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/pyproject.toml +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/setup.cfg +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/setup.py +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/src/cellarr_array/config.py +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/src/cellarr_array.egg-info/dependency_links.txt +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/src/cellarr_array.egg-info/not-zip-safe +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/src/cellarr_array.egg-info/requires.txt +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/src/cellarr_array.egg-info/top_level.txt +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/tests/conftest.py +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/tests/test_helpers.py +0 -0
- {cellarr_array-0.0.3 → cellarr_array-0.1.0}/tox.ini +0 -0
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
name: Test the library
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches:
|
|
6
|
+
- master # for legacy repos
|
|
7
|
+
- main
|
|
8
|
+
pull_request:
|
|
9
|
+
branches:
|
|
10
|
+
- master # for legacy repos
|
|
11
|
+
- main
|
|
12
|
+
workflow_dispatch: # Allow manually triggering the workflow
|
|
13
|
+
schedule:
|
|
14
|
+
# Run roughly every 15 days at 00:00 UTC
|
|
15
|
+
# (useful to check if updates on dependencies break the package)
|
|
16
|
+
- cron: "0 0 1,16 * *"
|
|
17
|
+
|
|
18
|
+
permissions:
|
|
19
|
+
contents: read
|
|
20
|
+
|
|
21
|
+
concurrency:
|
|
22
|
+
group: >-
|
|
23
|
+
${{ github.workflow }}-${{ github.ref_type }}-
|
|
24
|
+
${{ github.event.pull_request.number || github.sha }}
|
|
25
|
+
cancel-in-progress: true
|
|
26
|
+
|
|
27
|
+
jobs:
|
|
28
|
+
test:
|
|
29
|
+
strategy:
|
|
30
|
+
matrix:
|
|
31
|
+
python: ["3.9", "3.10", "3.11", "3.12", "3.13"]
|
|
32
|
+
platform:
|
|
33
|
+
- ubuntu-latest
|
|
34
|
+
# - macos-latest
|
|
35
|
+
# - windows-latest
|
|
36
|
+
runs-on: ${{ matrix.platform }}
|
|
37
|
+
name: Python ${{ matrix.python }}, ${{ matrix.platform }}
|
|
38
|
+
steps:
|
|
39
|
+
- uses: actions/checkout@v4
|
|
40
|
+
|
|
41
|
+
- uses: actions/setup-python@v5
|
|
42
|
+
id: setup-python
|
|
43
|
+
with:
|
|
44
|
+
python-version: ${{ matrix.python }}
|
|
45
|
+
|
|
46
|
+
- name: Install dependencies
|
|
47
|
+
run: |
|
|
48
|
+
python -m pip install --upgrade pip
|
|
49
|
+
pip install tox coverage
|
|
50
|
+
|
|
51
|
+
- name: Run tests
|
|
52
|
+
run: >-
|
|
53
|
+
pipx run --python '${{ steps.setup-python.outputs.python-path }}'
|
|
54
|
+
tox
|
|
55
|
+
-- -rFEx --durations 10 --color yes --cov --cov-branch --cov-report=xml # pytest args
|
|
56
|
+
|
|
57
|
+
- name: Check for codecov token availability
|
|
58
|
+
id: codecov-check
|
|
59
|
+
shell: bash
|
|
60
|
+
run: |
|
|
61
|
+
if [ ${{ secrets.CODECOV_TOKEN }} != '' ]; then
|
|
62
|
+
echo "codecov=true" >> $GITHUB_OUTPUT;
|
|
63
|
+
else
|
|
64
|
+
echo "codecov=false" >> $GITHUB_OUTPUT;
|
|
65
|
+
fi
|
|
66
|
+
|
|
67
|
+
- name: Upload coverage reports to Codecov with GitHub Action
|
|
68
|
+
uses: codecov/codecov-action@v5
|
|
69
|
+
if: ${{ steps.codecov-check.outputs.codecov == 'true' }}
|
|
70
|
+
env:
|
|
71
|
+
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
|
|
72
|
+
slug: ${{ github.repository }}
|
|
73
|
+
flags: ${{ matrix.platform }} - py${{ matrix.python }}
|
|
@@ -1,5 +1,11 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## Version 0.1.0
|
|
4
|
+
|
|
5
|
+
- Support cellarr-arrays on user provided tiledb array objects.
|
|
6
|
+
- Migrate github actions to the newer version from biocsetup.
|
|
7
|
+
- Renaming module names, documentation and tests
|
|
8
|
+
|
|
3
9
|
## Version 0.0.2
|
|
4
10
|
|
|
5
11
|
- Support in-memory tiledb objects. Updated tests and documentation.
|
|
@@ -16,6 +16,6 @@ finally:
|
|
|
16
16
|
del version, PackageNotFoundError
|
|
17
17
|
|
|
18
18
|
from .config import CellArrConfig, ConsolidationConfig
|
|
19
|
-
from .
|
|
20
|
-
from .
|
|
19
|
+
from .cellarray_dense import DenseCellArray
|
|
20
|
+
from .cellarray_sparse import SparseCellArray
|
|
21
21
|
from .helpers import create_cellarray, SliceHelper
|
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from contextlib import contextmanager
|
|
3
|
+
|
|
4
|
+
try:
|
|
5
|
+
from types import EllipsisType
|
|
6
|
+
except ImportError:
|
|
7
|
+
# TODO: This is required for Python <3.10. Remove once Python 3.9 reaches EOL in October 2025
|
|
8
|
+
EllipsisType = type(...)
|
|
9
|
+
from typing import Any, List, Literal, Optional, Tuple, Union
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
import tiledb
|
|
13
|
+
from scipy import sparse
|
|
14
|
+
|
|
15
|
+
from .config import ConsolidationConfig
|
|
16
|
+
from .helpers import SliceHelper
|
|
17
|
+
|
|
18
|
+
__author__ = "Jayaram Kancherla"
|
|
19
|
+
__copyright__ = "Jayaram Kancherla"
|
|
20
|
+
__license__ = "MIT"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class CellArray(ABC):
|
|
24
|
+
"""Abstract base class for TileDB array operations."""
|
|
25
|
+
|
|
26
|
+
def __init__(
|
|
27
|
+
self,
|
|
28
|
+
uri: Optional[str] = None,
|
|
29
|
+
tiledb_array_obj: Optional[tiledb.Array] = None,
|
|
30
|
+
attr: str = "data",
|
|
31
|
+
mode: Optional[Literal["r", "w", "d", "m"]] = None,
|
|
32
|
+
config_or_context: Optional[Union[tiledb.Config, tiledb.Ctx]] = None,
|
|
33
|
+
validate: bool = True,
|
|
34
|
+
):
|
|
35
|
+
"""Initialize the object.
|
|
36
|
+
|
|
37
|
+
Args:
|
|
38
|
+
uri:
|
|
39
|
+
URI to the array.
|
|
40
|
+
Required if 'tiledb_array_obj' is not provided.
|
|
41
|
+
|
|
42
|
+
tiledb_array_obj:
|
|
43
|
+
Optional, an already opened ``tiledb.Array`` instance.
|
|
44
|
+
If provided, 'uri' can be None, and 'config_or_context' is ignored.
|
|
45
|
+
|
|
46
|
+
attr:
|
|
47
|
+
Attribute to access.
|
|
48
|
+
Defaults to "data".
|
|
49
|
+
|
|
50
|
+
mode:
|
|
51
|
+
Open the array object in read 'r', write 'w', modify
|
|
52
|
+
'm' mode, or delete 'd' mode.
|
|
53
|
+
|
|
54
|
+
Defaults to None for automatic mode switching.
|
|
55
|
+
|
|
56
|
+
If 'tiledb_array_obj' is provided, this mode should ideally match
|
|
57
|
+
the mode of the provided array or be None.
|
|
58
|
+
|
|
59
|
+
config_or_context:
|
|
60
|
+
Optional config or context object. Ignored if 'tiledb_array_obj' is provided,
|
|
61
|
+
as context will be derived from the object.
|
|
62
|
+
|
|
63
|
+
Defaults to None.
|
|
64
|
+
|
|
65
|
+
validate:
|
|
66
|
+
Whether to validate the attributes.
|
|
67
|
+
Defaults to True.
|
|
68
|
+
"""
|
|
69
|
+
self._array_passed_in = False
|
|
70
|
+
self._opened_array_external = None
|
|
71
|
+
self._ctx = None
|
|
72
|
+
|
|
73
|
+
if tiledb_array_obj is not None:
|
|
74
|
+
if not isinstance(tiledb_array_obj, tiledb.Array):
|
|
75
|
+
raise ValueError("'tiledb_array_obj' must be a tiledb.Array instance.")
|
|
76
|
+
|
|
77
|
+
if not tiledb_array_obj.isopen:
|
|
78
|
+
# Option 1: Raise error
|
|
79
|
+
raise ValueError("If 'tiledb_array_obj' is provided, it must be an open tiledb.Array instance.")
|
|
80
|
+
# Option 2: Try to reopen (less safe as we don't know original intent)
|
|
81
|
+
# try:
|
|
82
|
+
# tiledb_array_obj.reopen()
|
|
83
|
+
# except tiledb.TileDBError as e:
|
|
84
|
+
# raise ValueError(
|
|
85
|
+
# f"Provided 'tiledb_array_obj' is closed and could not be reopened: {e}"
|
|
86
|
+
# )
|
|
87
|
+
|
|
88
|
+
self.uri = tiledb_array_obj.uri
|
|
89
|
+
self._array_passed_in = True
|
|
90
|
+
self._opened_array_external = tiledb_array_obj
|
|
91
|
+
|
|
92
|
+
# infer mode if possible, or require it matches
|
|
93
|
+
if mode is not None and tiledb_array_obj.mode != mode:
|
|
94
|
+
# we could try to reopen with the desired mode
|
|
95
|
+
raise ValueError(
|
|
96
|
+
f"Provided array mode '{tiledb_array_obj.mode}' does not match requested mode '{mode}'.",
|
|
97
|
+
"Re-open the external array with the desired mode or pass matching mode.",
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
self._mode = tiledb_array_obj.mode
|
|
101
|
+
self._ctx = tiledb_array_obj.ctx
|
|
102
|
+
elif uri is not None:
|
|
103
|
+
self.uri = uri
|
|
104
|
+
self._mode = mode
|
|
105
|
+
self._array_passed_in = False
|
|
106
|
+
self._opened_array_external = None
|
|
107
|
+
|
|
108
|
+
if config_or_context is None:
|
|
109
|
+
self._ctx = None
|
|
110
|
+
elif isinstance(config_or_context, tiledb.Config):
|
|
111
|
+
self._ctx = tiledb.Ctx(config_or_context)
|
|
112
|
+
elif isinstance(config_or_context, tiledb.Ctx):
|
|
113
|
+
self._ctx = config_or_context
|
|
114
|
+
else:
|
|
115
|
+
raise TypeError("'config_or_context' must be a TileDB Config or Ctx object.")
|
|
116
|
+
else:
|
|
117
|
+
raise ValueError("Either 'uri' or 'tiledb_array_obj' must be provided.")
|
|
118
|
+
|
|
119
|
+
self._shape = None
|
|
120
|
+
self._ndim = None
|
|
121
|
+
self._dim_names = None
|
|
122
|
+
self._attr_names = None
|
|
123
|
+
self._nonempty_domain = None
|
|
124
|
+
|
|
125
|
+
if validate:
|
|
126
|
+
self._validate(attr=attr)
|
|
127
|
+
|
|
128
|
+
self._attr = attr
|
|
129
|
+
|
|
130
|
+
def _validate(self, attr):
|
|
131
|
+
with self.open_array(mode="r") as A:
|
|
132
|
+
schema = A.schema
|
|
133
|
+
if schema.ndim > 2:
|
|
134
|
+
raise ValueError("Only 1D and 2D arrays are supported.")
|
|
135
|
+
|
|
136
|
+
current_attr_names = [schema.attr(i).name for i in range(schema.nattr)]
|
|
137
|
+
if attr not in current_attr_names:
|
|
138
|
+
raise ValueError(
|
|
139
|
+
f"Attribute '{attr}' does not exist in the array. Available attributes: {current_attr_names}."
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
@property
|
|
143
|
+
def mode(self) -> Optional[str]:
|
|
144
|
+
"""Get current array mode. If an external array is used, this is its open mode."""
|
|
145
|
+
if self._array_passed_in and self._opened_array_external is not None:
|
|
146
|
+
return self._opened_array_external.mode
|
|
147
|
+
return self._mode
|
|
148
|
+
|
|
149
|
+
@mode.setter
|
|
150
|
+
def mode(self, value: Optional[str]):
|
|
151
|
+
"""Set array mode for subsequent operations if not using an external array.
|
|
152
|
+
|
|
153
|
+
This action does not affect an already passed-in external array's mode.
|
|
154
|
+
"""
|
|
155
|
+
if self._array_passed_in:
|
|
156
|
+
# To change mode of an external array, user must reopen it and pass it again.
|
|
157
|
+
current_ext_mode = self._opened_array_external.mode if self._opened_array_external else "unknown"
|
|
158
|
+
if value != current_ext_mode:
|
|
159
|
+
raise ValueError(
|
|
160
|
+
f"Cannot change mode of an externally managed array (current: {current_ext_mode}). "
|
|
161
|
+
"Re-open the external array with the new mode and re-initialize CellArray."
|
|
162
|
+
)
|
|
163
|
+
if value is not None and value not in ["r", "w", "m", "d"]:
|
|
164
|
+
raise ValueError("Mode must be one of: None, 'r', 'w', 'm', 'd'")
|
|
165
|
+
|
|
166
|
+
self._mode = value
|
|
167
|
+
|
|
168
|
+
@property
|
|
169
|
+
def dim_names(self) -> List[str]:
|
|
170
|
+
"""Get dimension names of the array."""
|
|
171
|
+
if self._dim_names is None:
|
|
172
|
+
with self.open_array(mode="r") as A:
|
|
173
|
+
self._dim_names = [dim.name for dim in A.schema.domain]
|
|
174
|
+
return self._dim_names
|
|
175
|
+
|
|
176
|
+
@property
|
|
177
|
+
def attr_names(self) -> List[str]:
|
|
178
|
+
"""Get attribute names of the array."""
|
|
179
|
+
if self._attr_names is None:
|
|
180
|
+
with self.open_array(mode="r") as A:
|
|
181
|
+
self._attr_names = [A.schema.attr(i).name for i in range(A.schema.nattr)]
|
|
182
|
+
return self._attr_names
|
|
183
|
+
|
|
184
|
+
@property
|
|
185
|
+
def shape(self) -> Tuple[int, ...]:
|
|
186
|
+
if self._shape is None:
|
|
187
|
+
with self.open_array(mode="r") as A:
|
|
188
|
+
self._shape = tuple(int(dim.domain[1] - dim.domain[0] + 1) for dim in A.schema.domain)
|
|
189
|
+
return self._shape
|
|
190
|
+
|
|
191
|
+
@property
|
|
192
|
+
def nonempty_domain(self) -> Optional[Tuple[Any, ...]]:
|
|
193
|
+
if self._nonempty_domain is None:
|
|
194
|
+
with self.open_array(mode="r") as A:
|
|
195
|
+
# nonempty_domain() can return None if the array is empty.
|
|
196
|
+
ned = A.nonempty_domain()
|
|
197
|
+
if ned is None:
|
|
198
|
+
self._nonempty_domain = None
|
|
199
|
+
else:
|
|
200
|
+
self._nonempty_domain = tuple(ned) if isinstance(ned[0], tuple) else (ned,)
|
|
201
|
+
return self._nonempty_domain
|
|
202
|
+
|
|
203
|
+
@property
|
|
204
|
+
def ndim(self) -> int:
|
|
205
|
+
"""Get number of dimensions."""
|
|
206
|
+
if self._ndim is None:
|
|
207
|
+
with self.open_array(mode="r") as A:
|
|
208
|
+
self._ndim = A.schema.ndim
|
|
209
|
+
# self._ndim = len(self.shape)
|
|
210
|
+
return self._ndim
|
|
211
|
+
|
|
212
|
+
@contextmanager
|
|
213
|
+
def open_array(self, mode: Optional[str] = None):
|
|
214
|
+
"""Context manager for array operations.
|
|
215
|
+
|
|
216
|
+
Uses the externally provided array if available, otherwise opens from URI.
|
|
217
|
+
|
|
218
|
+
Args:
|
|
219
|
+
mode:
|
|
220
|
+
Desired mode for the operation ('r', 'w', 'm', 'd').
|
|
221
|
+
If an external array is used, this mode must be compatible with
|
|
222
|
+
(or same as) the mode the external array was opened with.
|
|
223
|
+
|
|
224
|
+
If None, uses the CellArray's default mode.
|
|
225
|
+
"""
|
|
226
|
+
if self._array_passed_in and self._opened_array_external is not None:
|
|
227
|
+
if not self._opened_array_external.isopen:
|
|
228
|
+
# Attempt to reopen if closed. This assumes the user might have closed it
|
|
229
|
+
# and expects CellArr to reopen it if still possible.
|
|
230
|
+
try:
|
|
231
|
+
self._opened_array_external.reopen()
|
|
232
|
+
except Exception as e:
|
|
233
|
+
raise tiledb.TileDBError(
|
|
234
|
+
f"Externally provided array is closed and could not be reopened: {e}"
|
|
235
|
+
) from e
|
|
236
|
+
|
|
237
|
+
effective_mode = mode if mode is not None else self._opened_array_external.mode
|
|
238
|
+
|
|
239
|
+
current_external_mode = self._opened_array_external.mode
|
|
240
|
+
if effective_mode == "r" and current_external_mode not in ["r", "w", "m"]:
|
|
241
|
+
# Read ops ok on write/modify modes
|
|
242
|
+
pass
|
|
243
|
+
elif effective_mode in ["w", "d"] and current_external_mode != effective_mode:
|
|
244
|
+
raise tiledb.TileDBError(
|
|
245
|
+
f"Requested operation mode '{effective_mode}' is incompatible with the "
|
|
246
|
+
f"externally provided array's mode '{current_external_mode}'. "
|
|
247
|
+
"Ensure the external array is opened in a compatible mode."
|
|
248
|
+
)
|
|
249
|
+
|
|
250
|
+
# DO NOT close self._opened_array_external here; its lifecycle is managed by the user.
|
|
251
|
+
yield self._opened_array_external
|
|
252
|
+
else:
|
|
253
|
+
effective_mode = mode if mode is not None else self.mode
|
|
254
|
+
effective_mode = effective_mode if effective_mode is not None else "r"
|
|
255
|
+
array = tiledb.open(self.uri, mode=effective_mode, ctx=self._ctx)
|
|
256
|
+
|
|
257
|
+
try:
|
|
258
|
+
yield array
|
|
259
|
+
finally:
|
|
260
|
+
array.close()
|
|
261
|
+
|
|
262
|
+
def __getitem__(self, key: Union[slice, EllipsisType, Tuple[Union[slice, List[int]], ...], EllipsisType]):
|
|
263
|
+
"""Get item implementation that routes to either direct slicing or multi_index
|
|
264
|
+
based on the type of indices provided.
|
|
265
|
+
|
|
266
|
+
Args:
|
|
267
|
+
key:
|
|
268
|
+
Slice or list of indices for each dimension in the array.
|
|
269
|
+
"""
|
|
270
|
+
if not isinstance(key, tuple):
|
|
271
|
+
key = (key,)
|
|
272
|
+
|
|
273
|
+
if len(key) > self.ndim:
|
|
274
|
+
raise IndexError(f"Invalid number of dimensions: got {len(key)}, expected {self.ndim}")
|
|
275
|
+
|
|
276
|
+
# Normalize all indices
|
|
277
|
+
normalized_key = tuple(SliceHelper.normalize_index(idx, self.shape[i]) for i, idx in enumerate(key))
|
|
278
|
+
|
|
279
|
+
num_ellipsis = sum(isinstance(i, EllipsisType) for i in normalized_key)
|
|
280
|
+
if num_ellipsis > 1:
|
|
281
|
+
raise IndexError(f"Found more than 1 Ellipsis (...) in key: {normalized_key}")
|
|
282
|
+
|
|
283
|
+
# Check if we can use direct slicing
|
|
284
|
+
use_direct = all(isinstance(idx, (slice, EllipsisType)) for idx in normalized_key)
|
|
285
|
+
|
|
286
|
+
if use_direct:
|
|
287
|
+
return self._direct_slice(normalized_key)
|
|
288
|
+
else:
|
|
289
|
+
if num_ellipsis > 0:
|
|
290
|
+
raise IndexError(f"tiledb does not support ellipsis in multi-index access: {normalized_key}")
|
|
291
|
+
return self._multi_index(normalized_key)
|
|
292
|
+
|
|
293
|
+
@abstractmethod
|
|
294
|
+
def _direct_slice(self, key: Tuple[Union[slice, EllipsisType], ...]) -> np.ndarray:
|
|
295
|
+
"""Implementation for direct slicing."""
|
|
296
|
+
pass
|
|
297
|
+
|
|
298
|
+
@abstractmethod
|
|
299
|
+
def _multi_index(self, key: Tuple[Union[slice, List[int]], ...]) -> np.ndarray:
|
|
300
|
+
"""Implementation for multi-index access."""
|
|
301
|
+
pass
|
|
302
|
+
|
|
303
|
+
def vacuum(self) -> None:
|
|
304
|
+
"""Remove deleted fragments from the array."""
|
|
305
|
+
tiledb.vacuum(self.uri)
|
|
306
|
+
|
|
307
|
+
def consolidate(self, config: Optional[ConsolidationConfig] = None) -> None:
|
|
308
|
+
"""Consolidate array fragments.
|
|
309
|
+
|
|
310
|
+
Args:
|
|
311
|
+
config:
|
|
312
|
+
Optional consolidation configuration.
|
|
313
|
+
"""
|
|
314
|
+
if config is None:
|
|
315
|
+
config = ConsolidationConfig()
|
|
316
|
+
|
|
317
|
+
consolidation_cfg = tiledb.Config()
|
|
318
|
+
|
|
319
|
+
consolidation_cfg["sm.consolidation.steps"] = config.steps
|
|
320
|
+
consolidation_cfg["sm.consolidation.step_min_frags"] = config.step_min_frags
|
|
321
|
+
consolidation_cfg["sm.consolidation.step_max_frags"] = config.step_max_frags
|
|
322
|
+
consolidation_cfg["sm.consolidation.buffer_size"] = config.buffer_size
|
|
323
|
+
consolidation_cfg["sm.mem.total_budget"] = config.total_budget
|
|
324
|
+
|
|
325
|
+
tiledb.consolidate(self.uri, config=consolidation_cfg)
|
|
326
|
+
|
|
327
|
+
if config.vacuum_after:
|
|
328
|
+
self.vacuum()
|
|
329
|
+
|
|
330
|
+
@abstractmethod
|
|
331
|
+
def write_batch(self, data: Union[np.ndarray, sparse.spmatrix], start_row: int, **kwargs) -> None:
|
|
332
|
+
"""Write a batch of data to the array starting at the specified row.
|
|
333
|
+
|
|
334
|
+
Args:
|
|
335
|
+
data:
|
|
336
|
+
Data to write (numpy array for dense, scipy sparse matrix for sparse).
|
|
337
|
+
|
|
338
|
+
start_row:
|
|
339
|
+
Starting row index for writing.
|
|
340
|
+
|
|
341
|
+
**kwargs:
|
|
342
|
+
Additional arguments for write operation.
|
|
343
|
+
"""
|
|
344
|
+
pass
|
|
@@ -3,13 +3,13 @@ try:
|
|
|
3
3
|
except ImportError:
|
|
4
4
|
# TODO: This is required for Python <3.10. Remove once Python 3.9 reaches EOL in October 2025
|
|
5
5
|
EllipsisType = type(...)
|
|
6
|
-
from typing import Dict, List, Optional, Tuple, Union
|
|
6
|
+
from typing import Dict, List, Literal, Optional, Tuple, Union
|
|
7
7
|
|
|
8
8
|
import numpy as np
|
|
9
9
|
import tiledb
|
|
10
10
|
from scipy import sparse
|
|
11
11
|
|
|
12
|
-
from .
|
|
12
|
+
from .cellarray_base import CellArray
|
|
13
13
|
from .helpers import SliceHelper
|
|
14
14
|
|
|
15
15
|
__author__ = "Jayaram Kancherla"
|
|
@@ -22,15 +22,68 @@ class SparseCellArray(CellArray):
|
|
|
22
22
|
|
|
23
23
|
def __init__(
|
|
24
24
|
self,
|
|
25
|
-
uri: str,
|
|
25
|
+
uri: Optional[str] = None,
|
|
26
|
+
tiledb_array_obj: Optional[tiledb.Array] = None,
|
|
26
27
|
attr: str = "data",
|
|
27
|
-
mode:
|
|
28
|
+
mode: Optional[Literal["r", "w", "d", "m"]] = None,
|
|
28
29
|
config_or_context: Optional[Union[tiledb.Config, tiledb.Ctx]] = None,
|
|
29
30
|
return_sparse: bool = True,
|
|
30
31
|
sparse_coerce: Union[sparse.csr_matrix, sparse.csc_matrix] = sparse.csr_matrix,
|
|
32
|
+
validate: bool = True,
|
|
33
|
+
**kwargs,
|
|
31
34
|
):
|
|
32
|
-
"""Initialize
|
|
33
|
-
|
|
35
|
+
"""Initialize the object.
|
|
36
|
+
|
|
37
|
+
Args:
|
|
38
|
+
uri:
|
|
39
|
+
URI to the array.
|
|
40
|
+
Required if 'tiledb_array_obj' is not provided.
|
|
41
|
+
|
|
42
|
+
tiledb_array_obj:
|
|
43
|
+
Optional, an already opened ``tiledb.Array`` instance.
|
|
44
|
+
If provided, 'uri' can be None, and 'config_or_context' is ignored.
|
|
45
|
+
|
|
46
|
+
attr:
|
|
47
|
+
Attribute to access.
|
|
48
|
+
Defaults to "data".
|
|
49
|
+
|
|
50
|
+
mode:
|
|
51
|
+
Open the array object in read 'r', write 'w', modify
|
|
52
|
+
'm' mode, or delete 'd' mode.
|
|
53
|
+
|
|
54
|
+
Defaults to None for automatic mode switching.
|
|
55
|
+
|
|
56
|
+
If 'tiledb_array_obj' is provided, this mode should ideally match
|
|
57
|
+
the mode of the provided array or be None.
|
|
58
|
+
|
|
59
|
+
config_or_context:
|
|
60
|
+
Optional config or context object. Ignored if 'tiledb_array_obj' is provided,
|
|
61
|
+
as context will be derived from the object.
|
|
62
|
+
|
|
63
|
+
Defaults to None.
|
|
64
|
+
|
|
65
|
+
return_sparse:
|
|
66
|
+
Whether to return a sparse representation of the data when object is sliced.
|
|
67
|
+
Default is to return a dictionary that contains coordinates and values.
|
|
68
|
+
|
|
69
|
+
sparse_coerce:
|
|
70
|
+
Format to return, defaults to csr_matrix.
|
|
71
|
+
|
|
72
|
+
validate:
|
|
73
|
+
Whether to validate the attributes.
|
|
74
|
+
Defaults to True.
|
|
75
|
+
|
|
76
|
+
kwargs:
|
|
77
|
+
Additional arguments.
|
|
78
|
+
"""
|
|
79
|
+
super().__init__(
|
|
80
|
+
uri=uri,
|
|
81
|
+
tiledb_array_obj=tiledb_array_obj,
|
|
82
|
+
attr=attr,
|
|
83
|
+
mode=mode,
|
|
84
|
+
config_or_context=config_or_context,
|
|
85
|
+
validate=validate,
|
|
86
|
+
)
|
|
34
87
|
|
|
35
88
|
self.return_sparse = return_sparse
|
|
36
89
|
self.sparse_coerce = sparse.csr_matrix if sparse_coerce is None else sparse_coerce
|
|
@@ -187,21 +240,21 @@ class SparseCellArray(CellArray):
|
|
|
187
240
|
raise TypeError("Input must be a scipy sparse matrix.")
|
|
188
241
|
|
|
189
242
|
# Validate and adjust dimensions
|
|
190
|
-
|
|
243
|
+
coo_data, is_1d = self._validate_matrix_dims(data)
|
|
191
244
|
|
|
192
245
|
# Check bounds
|
|
193
|
-
end_row = start_row +
|
|
246
|
+
end_row = start_row + coo_data.shape[0]
|
|
194
247
|
if end_row > self.shape[0]:
|
|
195
248
|
raise ValueError(
|
|
196
249
|
f"Write operation would exceed array bounds. End row {end_row} > array rows {self.shape[0]}."
|
|
197
250
|
)
|
|
198
251
|
|
|
199
|
-
if not is_1d and
|
|
200
|
-
raise ValueError(f"Data columns {
|
|
252
|
+
if not is_1d and coo_data.shape[1] != self.shape[1]:
|
|
253
|
+
raise ValueError(f"Data columns {coo_data.shape[1]} don't match array columns {self.shape[1]}.")
|
|
201
254
|
|
|
202
|
-
adjusted_rows =
|
|
255
|
+
adjusted_rows = coo_data.row + start_row
|
|
203
256
|
with self.open_array(mode="w") as array:
|
|
204
257
|
if is_1d:
|
|
205
|
-
array[adjusted_rows] =
|
|
258
|
+
array[adjusted_rows] = coo_data.data
|
|
206
259
|
else:
|
|
207
|
-
array[adjusted_rows,
|
|
260
|
+
array[adjusted_rows, coo_data.col] = coo_data.data
|
|
@@ -133,11 +133,15 @@ def create_cellarray(
|
|
|
133
133
|
tiledb.Array.create(uri, schema)
|
|
134
134
|
|
|
135
135
|
# Import here to avoid circular imports
|
|
136
|
-
from .
|
|
137
|
-
from .
|
|
136
|
+
from .cellarray_dense import DenseCellArray
|
|
137
|
+
from .cellarray_sparse import SparseCellArray
|
|
138
138
|
|
|
139
139
|
# Return appropriate array type
|
|
140
|
-
return
|
|
140
|
+
return (
|
|
141
|
+
SparseCellArray(uri=uri, attr=attr_name, mode=mode)
|
|
142
|
+
if sparse
|
|
143
|
+
else DenseCellArray(uri=uri, attr=attr_name, mode=mode)
|
|
144
|
+
)
|
|
141
145
|
|
|
142
146
|
|
|
143
147
|
class SliceHelper:
|
|
@@ -23,10 +23,10 @@ docs/license.md
|
|
|
23
23
|
docs/readme.md
|
|
24
24
|
docs/requirements.txt
|
|
25
25
|
docs/_static/.gitignore
|
|
26
|
-
src/cellarr_array/CellArray.py
|
|
27
|
-
src/cellarr_array/DenseCellArray.py
|
|
28
|
-
src/cellarr_array/SparseCellArray.py
|
|
29
26
|
src/cellarr_array/__init__.py
|
|
27
|
+
src/cellarr_array/cellarray_base.py
|
|
28
|
+
src/cellarr_array/cellarray_dense.py
|
|
29
|
+
src/cellarr_array/cellarray_sparse.py
|
|
30
30
|
src/cellarr_array/config.py
|
|
31
31
|
src/cellarr_array/helpers.py
|
|
32
32
|
src/cellarr_array.egg-info/PKG-INFO
|
|
@@ -24,7 +24,7 @@ def test_attribute_validation(temp_dir):
|
|
|
24
24
|
create_cellarray(uri=uri, shape=(10, 10), attr_dtype=np.float32, attr_name="values")
|
|
25
25
|
|
|
26
26
|
with pytest.raises(ValueError, match="Attribute 'invalid' does not exist"):
|
|
27
|
-
DenseCellArray(uri, attr="invalid")
|
|
27
|
+
DenseCellArray(uri=uri, attr="invalid")
|
|
28
28
|
|
|
29
29
|
|
|
30
30
|
def test_1d_integration(temp_dir):
|
|
@@ -2,6 +2,7 @@ from pathlib import Path
|
|
|
2
2
|
|
|
3
3
|
import numpy as np
|
|
4
4
|
import pytest
|
|
5
|
+
import tiledb
|
|
5
6
|
|
|
6
7
|
from cellarr_array import DenseCellArray, create_cellarray
|
|
7
8
|
|
|
@@ -172,10 +173,24 @@ def test_invalid_operations(sample_dense_array_2d):
|
|
|
172
173
|
sample_dense_array_2d.mode = "invalid"
|
|
173
174
|
|
|
174
175
|
with pytest.raises(ValueError, match="Attribute .* does not exist"):
|
|
175
|
-
DenseCellArray(sample_dense_array_2d.uri, attr="invalid_attr")
|
|
176
|
+
DenseCellArray(uri=sample_dense_array_2d.uri, attr="invalid_attr")
|
|
176
177
|
|
|
177
178
|
with pytest.raises(IndexError, match="Invalid number of dimensions"):
|
|
178
179
|
_ = sample_dense_array_2d[0:10, 0:10, 0:10]
|
|
179
180
|
|
|
180
181
|
with pytest.raises(IndexError, match="out of bounds"):
|
|
181
182
|
_ = sample_dense_array_2d[200:300]
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def test_array_object(temp_dir):
|
|
186
|
+
uri = str(Path(temp_dir) / "test_dense_1d")
|
|
187
|
+
array = create_cellarray(uri=uri, shape=(100,), attr_dtype=np.float32, sparse=False)
|
|
188
|
+
tdb_obj = tiledb.open(uri, "r")
|
|
189
|
+
alt_array = DenseCellArray(tiledb_array_obj=tdb_obj)
|
|
190
|
+
|
|
191
|
+
assert isinstance(array, DenseCellArray)
|
|
192
|
+
assert array.shape == alt_array.shape
|
|
193
|
+
assert array.ndim == alt_array.ndim
|
|
194
|
+
assert array.dim_names == alt_array.dim_names
|
|
195
|
+
assert "data" in array.attr_names
|
|
196
|
+
assert "data" in alt_array.attr_names
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import numpy as np
|
|
2
2
|
import scipy as sp
|
|
3
|
+
import tiledb
|
|
3
4
|
|
|
4
|
-
from cellarr_array import create_cellarray
|
|
5
|
+
from cellarr_array import DenseCellArray, SparseCellArray, create_cellarray
|
|
5
6
|
|
|
6
7
|
__author__ = "Jayaram Kancherla"
|
|
7
8
|
__copyright__ = "Jayaram Kancherla"
|
|
@@ -13,11 +14,15 @@ def test_inmem_uri():
|
|
|
13
14
|
arr = np.arange(100_000_000).reshape(shape)
|
|
14
15
|
uri = "mem://dense"
|
|
15
16
|
|
|
16
|
-
dense_inmem = create_cellarray(uri, shape=(shape))
|
|
17
|
+
dense_inmem = create_cellarray(uri=uri, shape=(shape))
|
|
17
18
|
dense_inmem.write_batch(arr, start_row=0)
|
|
18
19
|
|
|
19
20
|
assert np.allclose(dense_inmem[:10, :10], arr[:10, :10])
|
|
20
21
|
|
|
22
|
+
tdb_obj = tiledb.open(uri, "r")
|
|
23
|
+
alt_array = DenseCellArray(tiledb_array_obj=tdb_obj)
|
|
24
|
+
assert np.allclose(alt_array[:10, :10], arr[:10, :10])
|
|
25
|
+
|
|
21
26
|
|
|
22
27
|
def test_inmem_uri_sparse():
|
|
23
28
|
shape = (1000, 1000)
|
|
@@ -29,3 +34,7 @@ def test_inmem_uri_sparse():
|
|
|
29
34
|
dense_inmem.write_batch(s, start_row=0)
|
|
30
35
|
|
|
31
36
|
assert np.allclose(dense_inmem[:10, :10].toarray(), s.tocsr()[:10, :10].toarray())
|
|
37
|
+
|
|
38
|
+
tdb_obj = tiledb.open(uri, "r")
|
|
39
|
+
alt_array = SparseCellArray(tiledb_array_obj=tdb_obj)
|
|
40
|
+
assert np.allclose(alt_array[:10, :10].toarray(), s.tocsr()[:10, :10].toarray())
|
|
@@ -2,6 +2,7 @@ from pathlib import Path
|
|
|
2
2
|
|
|
3
3
|
import numpy as np
|
|
4
4
|
import pytest
|
|
5
|
+
import tiledb
|
|
5
6
|
from scipy import sparse
|
|
6
7
|
|
|
7
8
|
from cellarr_array import SparseCellArray, create_cellarray
|
|
@@ -44,7 +45,7 @@ def test_1d_write_batch(sample_sparse_array_1d):
|
|
|
44
45
|
expected = sparse_data.toarray().flatten()
|
|
45
46
|
np.testing.assert_array_almost_equal(result.toarray().flatten(), expected)
|
|
46
47
|
|
|
47
|
-
read_arr = SparseCellArray(sample_sparse_array_1d.uri, return_sparse=False)
|
|
48
|
+
read_arr = SparseCellArray(uri=sample_sparse_array_1d.uri, return_sparse=False)
|
|
48
49
|
|
|
49
50
|
# Full slice
|
|
50
51
|
result = read_arr[0:10]
|
|
@@ -68,7 +69,7 @@ def test_1d_empty_regions(sample_sparse_array_1d):
|
|
|
68
69
|
|
|
69
70
|
sample_sparse_array_1d.write_batch(sparse_data, start_row=0)
|
|
70
71
|
|
|
71
|
-
read_arr = SparseCellArray(sample_sparse_array_1d.uri, return_sparse=True)
|
|
72
|
+
read_arr = SparseCellArray(uri=sample_sparse_array_1d.uri, return_sparse=True)
|
|
72
73
|
|
|
73
74
|
# Query empty region
|
|
74
75
|
result = read_arr[7:10]
|
|
@@ -82,7 +83,7 @@ def test_2d_formats(sample_sparse_array_2d):
|
|
|
82
83
|
data = sparse.random(10, 50, density=0.1, format="coo", dtype=np.float32)
|
|
83
84
|
|
|
84
85
|
sample_sparse_array_2d.write_batch(data, start_row=0)
|
|
85
|
-
array_coo = SparseCellArray(sample_sparse_array_2d.uri, return_sparse=True)
|
|
86
|
+
array_coo = SparseCellArray(uri=sample_sparse_array_2d.uri, return_sparse=True)
|
|
86
87
|
result = array_coo[0:10, :]
|
|
87
88
|
np.testing.assert_array_almost_equal(result.toarray(), data.toarray())
|
|
88
89
|
|
|
@@ -91,7 +92,7 @@ def test_coo_output(sample_sparse_array_2d):
|
|
|
91
92
|
data = sparse.random(10, 50, density=0.1, format="coo", dtype=np.float32)
|
|
92
93
|
sample_sparse_array_2d.write_batch(data, start_row=0)
|
|
93
94
|
|
|
94
|
-
array_coo = SparseCellArray(sample_sparse_array_2d.uri, return_sparse=True)
|
|
95
|
+
array_coo = SparseCellArray(uri=sample_sparse_array_2d.uri, return_sparse=True)
|
|
95
96
|
|
|
96
97
|
# Test full slice
|
|
97
98
|
result = array_coo[0:10, :]
|
|
@@ -114,7 +115,7 @@ def test_mixed_slice_list_bounds(sample_sparse_array_2d):
|
|
|
114
115
|
data = sparse.random(100, 50, density=0.2, format="csr", dtype=np.float32)
|
|
115
116
|
sample_sparse_array_2d.write_batch(data, start_row=0)
|
|
116
117
|
|
|
117
|
-
array_coo = SparseCellArray(sample_sparse_array_2d.uri, return_sparse=True)
|
|
118
|
+
array_coo = SparseCellArray(uri=sample_sparse_array_2d.uri, return_sparse=True)
|
|
118
119
|
|
|
119
120
|
cols = [2, 4, 6]
|
|
120
121
|
|
|
@@ -142,7 +143,7 @@ def test_empty_regions(sample_sparse_array_2d):
|
|
|
142
143
|
data = sparse.random(10, 50, density=0.1, format="coo", dtype=np.float32)
|
|
143
144
|
sample_sparse_array_2d.write_batch(data, start_row=0)
|
|
144
145
|
|
|
145
|
-
array_coo = SparseCellArray(sample_sparse_array_2d.uri, return_sparse=True)
|
|
146
|
+
array_coo = SparseCellArray(uri=sample_sparse_array_2d.uri, return_sparse=True)
|
|
146
147
|
|
|
147
148
|
# Query empty region
|
|
148
149
|
result = array_coo[50:60, :]
|
|
@@ -178,3 +179,17 @@ def test_invalid_inputs(sample_sparse_array_2d):
|
|
|
178
179
|
attr_dtype=np.float32,
|
|
179
180
|
sparse=True,
|
|
180
181
|
)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def test_array_object(temp_dir):
|
|
185
|
+
uri = str(Path(temp_dir) / "test_sparse_1d")
|
|
186
|
+
array = create_cellarray(uri=uri, shape=(100,), attr_dtype=np.float32, sparse=True)
|
|
187
|
+
tdb_obj = tiledb.open(uri, "r")
|
|
188
|
+
alt_array = SparseCellArray(tiledb_array_obj=tdb_obj)
|
|
189
|
+
|
|
190
|
+
assert isinstance(array, SparseCellArray)
|
|
191
|
+
assert array.shape == alt_array.shape
|
|
192
|
+
assert array.ndim == alt_array.ndim
|
|
193
|
+
assert array.dim_names == alt_array.dim_names
|
|
194
|
+
assert "data" in array.attr_names
|
|
195
|
+
assert "data" in alt_array.attr_names
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
name: Run tests
|
|
2
|
-
|
|
3
|
-
on:
|
|
4
|
-
push:
|
|
5
|
-
branches: [master]
|
|
6
|
-
pull_request:
|
|
7
|
-
branches: [master]
|
|
8
|
-
|
|
9
|
-
jobs:
|
|
10
|
-
build:
|
|
11
|
-
runs-on: ubuntu-latest
|
|
12
|
-
strategy:
|
|
13
|
-
matrix:
|
|
14
|
-
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
|
|
15
|
-
|
|
16
|
-
name: Python ${{ matrix.python-version }}
|
|
17
|
-
steps:
|
|
18
|
-
- uses: actions/checkout@v4
|
|
19
|
-
|
|
20
|
-
- name: Setup Python
|
|
21
|
-
uses: actions/setup-python@v5
|
|
22
|
-
with:
|
|
23
|
-
python-version: ${{ matrix.python-version }}
|
|
24
|
-
cache: "pip"
|
|
25
|
-
|
|
26
|
-
- name: Install dependencies
|
|
27
|
-
run: |
|
|
28
|
-
python -m pip install --upgrade pip
|
|
29
|
-
pip install tox
|
|
30
|
-
|
|
31
|
-
- name: Test with tox
|
|
32
|
-
run: |
|
|
33
|
-
tox
|
|
@@ -1,251 +0,0 @@
|
|
|
1
|
-
from abc import ABC, abstractmethod
|
|
2
|
-
from contextlib import contextmanager
|
|
3
|
-
|
|
4
|
-
try:
|
|
5
|
-
from types import EllipsisType
|
|
6
|
-
except ImportError:
|
|
7
|
-
# TODO: This is required for Python <3.10. Remove once Python 3.9 reaches EOL in October 2025
|
|
8
|
-
EllipsisType = type(...)
|
|
9
|
-
from typing import List, Literal, Optional, Tuple, Union
|
|
10
|
-
|
|
11
|
-
import numpy as np
|
|
12
|
-
import tiledb
|
|
13
|
-
from scipy import sparse
|
|
14
|
-
|
|
15
|
-
from .config import ConsolidationConfig
|
|
16
|
-
from .helpers import SliceHelper
|
|
17
|
-
|
|
18
|
-
__author__ = "Jayaram Kancherla"
|
|
19
|
-
__copyright__ = "Jayaram Kancherla"
|
|
20
|
-
__license__ = "MIT"
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
class CellArray(ABC):
|
|
24
|
-
"""Abstract base class for TileDB array operations."""
|
|
25
|
-
|
|
26
|
-
def __init__(
|
|
27
|
-
self,
|
|
28
|
-
uri: str,
|
|
29
|
-
attr: str = "data",
|
|
30
|
-
mode: Optional[Literal["r", "w", "n", "d"]] = None,
|
|
31
|
-
config_or_context: Optional[Union[tiledb.Config, tiledb.Ctx]] = None,
|
|
32
|
-
validate: bool = True,
|
|
33
|
-
):
|
|
34
|
-
"""Initialize the object.
|
|
35
|
-
|
|
36
|
-
Args:
|
|
37
|
-
uri:
|
|
38
|
-
URI to the array.
|
|
39
|
-
|
|
40
|
-
attr:
|
|
41
|
-
Attribute to access.
|
|
42
|
-
Defaults to "data".
|
|
43
|
-
|
|
44
|
-
mode:
|
|
45
|
-
Open the array object in read 'r', write 'w', modify
|
|
46
|
-
exclusive 'm' mode, or delete 'd' mode.
|
|
47
|
-
|
|
48
|
-
Defaults to None for automatic mode switching.
|
|
49
|
-
|
|
50
|
-
config_or_context:
|
|
51
|
-
Optional config or context object.
|
|
52
|
-
|
|
53
|
-
Defaults to None.
|
|
54
|
-
|
|
55
|
-
validate:
|
|
56
|
-
Whether to validate the attributes.
|
|
57
|
-
Defaults to True.
|
|
58
|
-
"""
|
|
59
|
-
self.uri = uri
|
|
60
|
-
self._mode = mode
|
|
61
|
-
|
|
62
|
-
if config_or_context is None:
|
|
63
|
-
# config_or_context = tiledb.Config()
|
|
64
|
-
ctx = None
|
|
65
|
-
else:
|
|
66
|
-
if isinstance(config_or_context, tiledb.Config):
|
|
67
|
-
ctx = tiledb.Ctx(config_or_context)
|
|
68
|
-
elif isinstance(config_or_context, tiledb.Ctx):
|
|
69
|
-
ctx = config_or_context
|
|
70
|
-
else:
|
|
71
|
-
raise TypeError("'config_or_context' must be either TileDB config or a context object.")
|
|
72
|
-
|
|
73
|
-
self._ctx = ctx
|
|
74
|
-
self._array = None
|
|
75
|
-
self._shape = None
|
|
76
|
-
self._ndim = None
|
|
77
|
-
self._dim_names = None
|
|
78
|
-
self._attr_names = None
|
|
79
|
-
self._nonempty_domain = None
|
|
80
|
-
|
|
81
|
-
if validate:
|
|
82
|
-
self._validate(attr=attr)
|
|
83
|
-
|
|
84
|
-
self._attr = attr
|
|
85
|
-
|
|
86
|
-
def _validate(self, attr):
|
|
87
|
-
with self.open_array(mode="r") as A:
|
|
88
|
-
if A.ndim > 2:
|
|
89
|
-
raise ValueError("Only 1D and 2D arrays are supported.")
|
|
90
|
-
|
|
91
|
-
if attr not in self.attr_names:
|
|
92
|
-
raise ValueError(
|
|
93
|
-
f"Attribute '{attr}' does not exist in the array. Available attributes: {self.attr_names}."
|
|
94
|
-
)
|
|
95
|
-
|
|
96
|
-
@property
|
|
97
|
-
def mode(self) -> Optional[str]:
|
|
98
|
-
"""Get current array mode."""
|
|
99
|
-
return self._mode
|
|
100
|
-
|
|
101
|
-
@mode.setter
|
|
102
|
-
def mode(self, value: Optional[str]):
|
|
103
|
-
"""Set array mode.
|
|
104
|
-
|
|
105
|
-
Args:
|
|
106
|
-
value:
|
|
107
|
-
One of `None`, 'r', 'w', or 'm', 'd'.
|
|
108
|
-
"""
|
|
109
|
-
if value is not None and value not in ["r", "w", "m", "d"]:
|
|
110
|
-
raise ValueError("Mode must be one of: None, 'r', 'w', 'm', 'd'")
|
|
111
|
-
self._mode = value
|
|
112
|
-
|
|
113
|
-
@property
|
|
114
|
-
def dim_names(self) -> List[str]:
|
|
115
|
-
"""Get dimension names of the array."""
|
|
116
|
-
if self._dim_names is None:
|
|
117
|
-
with self.open_array(mode="r") as A:
|
|
118
|
-
self._dim_names = [dim.name for dim in A.schema.domain]
|
|
119
|
-
return self._dim_names
|
|
120
|
-
|
|
121
|
-
@property
|
|
122
|
-
def attr_names(self) -> List[str]:
|
|
123
|
-
"""Get attribute names of the array."""
|
|
124
|
-
if self._attr_names is None:
|
|
125
|
-
with self.open_array(mode="r") as A:
|
|
126
|
-
self._attr_names = [A.schema.attr(i).name for i in range(A.schema.nattr)]
|
|
127
|
-
return self._attr_names
|
|
128
|
-
|
|
129
|
-
@property
|
|
130
|
-
def shape(self) -> Tuple[int, ...]:
|
|
131
|
-
"""Get array shape from schema domain."""
|
|
132
|
-
if self._shape is None:
|
|
133
|
-
with self.open_array(mode="r") as A:
|
|
134
|
-
self._shape = tuple(int(dim.domain[1] - dim.domain[0] + 1) for dim in A.schema.domain)
|
|
135
|
-
return self._shape
|
|
136
|
-
|
|
137
|
-
@property
|
|
138
|
-
def nonempty_domain(self) -> Tuple[int, ...]:
|
|
139
|
-
"""Get array non-empty domain."""
|
|
140
|
-
if self._nonempty_domain is None:
|
|
141
|
-
with self.open_array(mode="r") as A:
|
|
142
|
-
self._nonempty_domain = A.nonempty_domain()
|
|
143
|
-
return self._nonempty_domain
|
|
144
|
-
|
|
145
|
-
@property
|
|
146
|
-
def ndim(self) -> int:
|
|
147
|
-
"""Get number of dimensions."""
|
|
148
|
-
if self._ndim is None:
|
|
149
|
-
self._ndim = len(self.shape)
|
|
150
|
-
return self._ndim
|
|
151
|
-
|
|
152
|
-
@contextmanager
|
|
153
|
-
def open_array(self, mode: Optional[str] = None):
|
|
154
|
-
"""Context manager for array operations.
|
|
155
|
-
|
|
156
|
-
Args:
|
|
157
|
-
mode:
|
|
158
|
-
Override mode for this operation.
|
|
159
|
-
"""
|
|
160
|
-
mode = mode if mode is not None else self.mode
|
|
161
|
-
mode = mode if mode is not None else "r" # Default to read mode
|
|
162
|
-
|
|
163
|
-
array = tiledb.open(self.uri, mode=mode, ctx=self._ctx)
|
|
164
|
-
try:
|
|
165
|
-
yield array
|
|
166
|
-
finally:
|
|
167
|
-
array.close()
|
|
168
|
-
|
|
169
|
-
def __getitem__(self, key: Union[slice, EllipsisType, Tuple[Union[slice, List[int]], ...], EllipsisType]):
|
|
170
|
-
"""Get item implementation that routes to either direct slicing or multi_index
|
|
171
|
-
based on the type of indices provided.
|
|
172
|
-
|
|
173
|
-
Args:
|
|
174
|
-
key:
|
|
175
|
-
Slice or list of indices for each dimension in the array.
|
|
176
|
-
"""
|
|
177
|
-
if not isinstance(key, tuple):
|
|
178
|
-
key = (key,)
|
|
179
|
-
|
|
180
|
-
if len(key) > self.ndim:
|
|
181
|
-
raise IndexError(f"Invalid number of dimensions: got {len(key)}, expected {self.ndim}")
|
|
182
|
-
|
|
183
|
-
# Normalize all indices
|
|
184
|
-
normalized_key = tuple(SliceHelper.normalize_index(idx, self.shape[i]) for i, idx in enumerate(key))
|
|
185
|
-
|
|
186
|
-
num_ellipsis = sum(isinstance(i, EllipsisType) for i in normalized_key)
|
|
187
|
-
if num_ellipsis > 1:
|
|
188
|
-
raise IndexError(f"Found more than 1 Ellipsis (...) in key: {normalized_key}")
|
|
189
|
-
|
|
190
|
-
# Check if we can use direct slicing
|
|
191
|
-
use_direct = all(isinstance(idx, (slice, EllipsisType)) for idx in normalized_key)
|
|
192
|
-
|
|
193
|
-
if use_direct:
|
|
194
|
-
return self._direct_slice(normalized_key)
|
|
195
|
-
else:
|
|
196
|
-
if num_ellipsis > 0:
|
|
197
|
-
raise IndexError(f"tiledb does not support ellipsis in multi-index access: {normalized_key}")
|
|
198
|
-
return self._multi_index(normalized_key)
|
|
199
|
-
|
|
200
|
-
@abstractmethod
|
|
201
|
-
def _direct_slice(self, key: Tuple[Union[slice, EllipsisType], ...]) -> np.ndarray:
|
|
202
|
-
"""Implementation for direct slicing."""
|
|
203
|
-
pass
|
|
204
|
-
|
|
205
|
-
@abstractmethod
|
|
206
|
-
def _multi_index(self, key: Tuple[Union[slice, List[int]], ...]) -> np.ndarray:
|
|
207
|
-
"""Implementation for multi-index access."""
|
|
208
|
-
pass
|
|
209
|
-
|
|
210
|
-
def vacuum(self) -> None:
|
|
211
|
-
"""Remove deleted fragments from the array."""
|
|
212
|
-
tiledb.vacuum(self.uri)
|
|
213
|
-
|
|
214
|
-
def consolidate(self, config: Optional[ConsolidationConfig] = None) -> None:
|
|
215
|
-
"""Consolidate array fragments.
|
|
216
|
-
|
|
217
|
-
Args:
|
|
218
|
-
config:
|
|
219
|
-
Optional consolidation configuration.
|
|
220
|
-
"""
|
|
221
|
-
if config is None:
|
|
222
|
-
config = ConsolidationConfig()
|
|
223
|
-
|
|
224
|
-
consolidation_cfg = tiledb.Config()
|
|
225
|
-
|
|
226
|
-
consolidation_cfg["sm.consolidation.steps"] = config.steps
|
|
227
|
-
consolidation_cfg["sm.consolidation.step_min_frags"] = config.step_min_frags
|
|
228
|
-
consolidation_cfg["sm.consolidation.step_max_frags"] = config.step_max_frags
|
|
229
|
-
consolidation_cfg["sm.consolidation.buffer_size"] = config.buffer_size
|
|
230
|
-
consolidation_cfg["sm.mem.total_budget"] = config.total_budget
|
|
231
|
-
|
|
232
|
-
tiledb.consolidate(self.uri, config=consolidation_cfg)
|
|
233
|
-
|
|
234
|
-
if config.vacuum_after:
|
|
235
|
-
self.vacuum()
|
|
236
|
-
|
|
237
|
-
@abstractmethod
|
|
238
|
-
def write_batch(self, data: Union[np.ndarray, sparse.spmatrix], start_row: int, **kwargs) -> None:
|
|
239
|
-
"""Write a batch of data to the array starting at the specified row.
|
|
240
|
-
|
|
241
|
-
Args:
|
|
242
|
-
data:
|
|
243
|
-
Data to write (numpy array for dense, scipy sparse matrix for sparse).
|
|
244
|
-
|
|
245
|
-
start_row:
|
|
246
|
-
Starting row index for writing.
|
|
247
|
-
|
|
248
|
-
**kwargs:
|
|
249
|
-
Additional arguments for write operation.
|
|
250
|
-
"""
|
|
251
|
-
pass
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|