cellarr-array 0.0.3__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cellarr-array might be problematic. Click here for more details.
- cellarr_array/__init__.py +2 -4
- cellarr_array/core/__init__.py +3 -0
- cellarr_array/core/base.py +344 -0
- cellarr_array/{DenseCellArray.py → core/dense.py} +2 -3
- cellarr_array/{helpers.py → core/helpers.py} +80 -42
- cellarr_array/{SparseCellArray.py → core/sparse.py} +75 -27
- cellarr_array/dataloaders/__init__.py +3 -0
- cellarr_array/dataloaders/denseloader.py +198 -0
- cellarr_array/dataloaders/iterabledataloader.py +320 -0
- cellarr_array/dataloaders/sparseloader.py +230 -0
- cellarr_array/dataloaders/utils.py +26 -0
- cellarr_array/utils/__init__.py +3 -0
- cellarr_array/utils/mock.py +167 -0
- {cellarr_array-0.0.3.dist-info → cellarr_array-0.2.0.dist-info}/METADATA +4 -1
- cellarr_array-0.2.0.dist-info/RECORD +19 -0
- {cellarr_array-0.0.3.dist-info → cellarr_array-0.2.0.dist-info}/WHEEL +1 -1
- {cellarr_array-0.0.3.dist-info → cellarr_array-0.2.0.dist-info}/licenses/LICENSE.txt +1 -1
- cellarr_array/CellArray.py +0 -251
- cellarr_array-0.0.3.dist-info/RECORD +0 -11
- /cellarr_array/{config.py → utils/config.py} +0 -0
- {cellarr_array-0.0.3.dist-info → cellarr_array-0.2.0.dist-info}/top_level.txt +0 -0
cellarr_array/__init__.py
CHANGED
|
@@ -15,7 +15,5 @@ except PackageNotFoundError: # pragma: no cover
|
|
|
15
15
|
finally:
|
|
16
16
|
del version, PackageNotFoundError
|
|
17
17
|
|
|
18
|
-
from .
|
|
19
|
-
from .
|
|
20
|
-
from .SparseCellArray import SparseCellArray
|
|
21
|
-
from .helpers import create_cellarray, SliceHelper
|
|
18
|
+
from .core import DenseCellArray, SparseCellArray
|
|
19
|
+
from .utils import CellArrConfig, ConsolidationConfig, create_cellarray
|
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from contextlib import contextmanager
|
|
3
|
+
|
|
4
|
+
try:
|
|
5
|
+
from types import EllipsisType
|
|
6
|
+
except ImportError:
|
|
7
|
+
# TODO: This is required for Python <3.10. Remove once Python 3.9 reaches EOL in October 2025
|
|
8
|
+
EllipsisType = type(...)
|
|
9
|
+
from typing import Any, List, Literal, Optional, Tuple, Union
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
import tiledb
|
|
13
|
+
from scipy import sparse
|
|
14
|
+
|
|
15
|
+
from ..utils.config import ConsolidationConfig
|
|
16
|
+
from .helpers import SliceHelper
|
|
17
|
+
|
|
18
|
+
__author__ = "Jayaram Kancherla"
|
|
19
|
+
__copyright__ = "Jayaram Kancherla"
|
|
20
|
+
__license__ = "MIT"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class CellArray(ABC):
    """Abstract base class for TileDB array operations.

    An instance either wraps an already opened ``tiledb.Array`` supplied by the
    caller (whose lifecycle stays with the caller) or opens the array located
    at ``uri`` on demand for every operation.

    Subclasses implement ``_direct_slice``, ``_multi_index`` and
    ``write_batch``.
    """

    def __init__(
        self,
        uri: Optional[str] = None,
        tiledb_array_obj: Optional[tiledb.Array] = None,
        attr: str = "data",
        mode: Optional[Literal["r", "w", "d", "m"]] = None,
        config_or_context: Optional[Union[tiledb.Config, tiledb.Ctx]] = None,
        validate: bool = True,
    ):
        """Initialize the object.

        Args:
            uri:
                URI to the array.
                Required if 'tiledb_array_obj' is not provided.

            tiledb_array_obj:
                Optional, an already opened ``tiledb.Array`` instance.
                If provided, 'uri' can be None, and 'config_or_context' is ignored.

            attr:
                Attribute to access.
                Defaults to "data".

            mode:
                Open the array object in read 'r', write 'w', modify
                'm' mode, or delete 'd' mode.

                Defaults to None for automatic mode switching.

                If 'tiledb_array_obj' is provided, this mode must match
                the mode of the provided array or be None.

            config_or_context:
                Optional config or context object. Ignored if 'tiledb_array_obj' is provided,
                as context will be derived from the object.

                Defaults to None.

            validate:
                Whether to validate the attributes.
                Defaults to True.

        Raises:
            ValueError:
                If neither 'uri' nor 'tiledb_array_obj' is provided, if
                'tiledb_array_obj' is not an open ``tiledb.Array``, if the
                requested mode differs from the external array's mode, or
                if validation fails.

            TypeError:
                If 'config_or_context' is neither a TileDB Config nor Ctx.
        """
        self._array_passed_in = False
        self._opened_array_external = None
        self._ctx = None

        if tiledb_array_obj is not None:
            if not isinstance(tiledb_array_obj, tiledb.Array):
                raise ValueError("'tiledb_array_obj' must be a tiledb.Array instance.")

            # A closed array is rejected rather than reopened because the
            # caller's original intent (mode, timestamp) is unknown.
            if not tiledb_array_obj.isopen:
                raise ValueError("If 'tiledb_array_obj' is provided, it must be an open tiledb.Array instance.")

            self.uri = tiledb_array_obj.uri
            self._array_passed_in = True
            self._opened_array_external = tiledb_array_obj

            if mode is not None and tiledb_array_obj.mode != mode:
                # Single concatenated message; passing two arguments would
                # make the exception text render as a tuple.
                raise ValueError(
                    f"Provided array mode '{tiledb_array_obj.mode}' does not match requested mode '{mode}'. "
                    "Re-open the external array with the desired mode or pass matching mode."
                )

            self._mode = tiledb_array_obj.mode
            self._ctx = tiledb_array_obj.ctx
        elif uri is not None:
            self.uri = uri
            self._mode = mode

            if config_or_context is None:
                self._ctx = None
            elif isinstance(config_or_context, tiledb.Config):
                self._ctx = tiledb.Ctx(config_or_context)
            elif isinstance(config_or_context, tiledb.Ctx):
                self._ctx = config_or_context
            else:
                raise TypeError("'config_or_context' must be a TileDB Config or Ctx object.")
        else:
            raise ValueError("Either 'uri' or 'tiledb_array_obj' must be provided.")

        # Lazily populated caches for schema-derived properties.
        self._shape = None
        self._ndim = None
        self._dim_names = None
        self._attr_names = None
        self._nonempty_domain = None

        if validate:
            self._validate(attr=attr)

        self._attr = attr

    def _validate(self, attr):
        """Check that the array has at most 2 dimensions and contains ``attr``.

        Args:
            attr:
                Name of the attribute that must exist in the array schema.

        Raises:
            ValueError:
                If the array has more than 2 dimensions or ``attr`` is not
                one of its attributes.
        """
        with self.open_array(mode="r") as A:
            schema = A.schema
            if schema.ndim > 2:
                raise ValueError("Only 1D and 2D arrays are supported.")

            current_attr_names = [schema.attr(i).name for i in range(schema.nattr)]
            if attr not in current_attr_names:
                raise ValueError(
                    f"Attribute '{attr}' does not exist in the array. Available attributes: {current_attr_names}."
                )

    @property
    def mode(self) -> Optional[str]:
        """Get current array mode. If an external array is used, this is its open mode."""
        if self._array_passed_in and self._opened_array_external is not None:
            return self._opened_array_external.mode
        return self._mode

    @mode.setter
    def mode(self, value: Optional[str]):
        """Set array mode for subsequent operations if not using an external array.

        This action does not affect an already passed-in external array's mode.

        Raises:
            ValueError:
                If an external array is in use and ``value`` differs from its
                mode, or if ``value`` is not None, 'r', 'w', 'm' or 'd'.
        """
        if self._array_passed_in:
            # To change mode of an external array, user must reopen it and pass it again.
            if self._opened_array_external is not None:
                current_ext_mode = self._opened_array_external.mode
            else:
                current_ext_mode = "unknown"

            if value != current_ext_mode:
                raise ValueError(
                    f"Cannot change mode of an externally managed array (current: {current_ext_mode}). "
                    "Re-open the external array with the new mode and re-initialize CellArray."
                )
        if value is not None and value not in ["r", "w", "m", "d"]:
            raise ValueError("Mode must be one of: None, 'r', 'w', 'm', 'd'")

        self._mode = value

    @property
    def dim_names(self) -> List[str]:
        """Get dimension names of the array."""
        if self._dim_names is None:
            with self.open_array(mode="r") as A:
                self._dim_names = [dim.name for dim in A.schema.domain]
        return self._dim_names

    @property
    def attr_names(self) -> List[str]:
        """Get attribute names of the array."""
        if self._attr_names is None:
            with self.open_array(mode="r") as A:
                self._attr_names = [A.schema.attr(i).name for i in range(A.schema.nattr)]
        return self._attr_names

    @property
    def shape(self) -> Tuple[int, ...]:
        """Get the array shape, computed from each dimension's domain bounds."""
        if self._shape is None:
            with self.open_array(mode="r") as A:
                # Each dimension's domain is an inclusive (lower, upper) pair.
                self._shape = tuple(int(dim.domain[1] - dim.domain[0] + 1) for dim in A.schema.domain)
        return self._shape

    @property
    def nonempty_domain(self) -> Optional[Tuple[Any, ...]]:
        """Get the non-empty domain of the array, or None if TileDB reports none.

        A non-None result is cached on first access; a None result is looked
        up again on the next access.
        """
        if self._nonempty_domain is None:
            with self.open_array(mode="r") as A:
                # nonempty_domain() can return None if the array is empty.
                ned = A.nonempty_domain()
                if ned is None:
                    self._nonempty_domain = None
                else:
                    # Wrap a bare (lower, upper) pair so the result is
                    # always a tuple of per-dimension entries.
                    self._nonempty_domain = tuple(ned) if isinstance(ned[0], tuple) else (ned,)
        return self._nonempty_domain

    @property
    def ndim(self) -> int:
        """Get number of dimensions."""
        if self._ndim is None:
            with self.open_array(mode="r") as A:
                self._ndim = A.schema.ndim
        return self._ndim

    @contextmanager
    def open_array(self, mode: Optional[str] = None):
        """Context manager for array operations.

        Uses the externally provided array if available, otherwise opens from URI.

        Args:
            mode:
                Desired mode for the operation ('r', 'w', 'm', 'd').
                If an external array is used, this mode must be compatible with
                (or same as) the mode the external array was opened with.

                If None, uses the CellArray's default mode.

        Yields:
            The open ``tiledb.Array``. An array opened here from the URI is
            closed on exit; an externally provided array is left open.

        Raises:
            tiledb.TileDBError:
                If the external array is closed and cannot be reopened, or if
                a 'w'/'d' operation is requested on an external array opened
                in a different mode.
        """
        if self._array_passed_in and self._opened_array_external is not None:
            external = self._opened_array_external
            if not external.isopen:
                # Attempt to reopen if closed. This assumes the user might have closed it
                # and expects CellArr to reopen it if still possible.
                try:
                    external.reopen()
                except Exception as e:
                    raise tiledb.TileDBError(
                        f"Externally provided array is closed and could not be reopened: {e}"
                    ) from e

            current_external_mode = external.mode
            effective_mode = mode if mode is not None else current_external_mode

            # Only write and delete requests must match the external array's
            # mode; other requests are passed through unchecked.
            if effective_mode in ["w", "d"] and current_external_mode != effective_mode:
                raise tiledb.TileDBError(
                    f"Requested operation mode '{effective_mode}' is incompatible with the "
                    f"externally provided array's mode '{current_external_mode}'. "
                    "Ensure the external array is opened in a compatible mode."
                )

            # DO NOT close the external array here; its lifecycle is managed by the user.
            yield external
        else:
            effective_mode = mode if mode is not None else self.mode
            effective_mode = effective_mode if effective_mode is not None else "r"
            array = tiledb.open(self.uri, mode=effective_mode, ctx=self._ctx)

            try:
                yield array
            finally:
                array.close()

    def __getitem__(
        self,
        key: Union[
            int,
            slice,
            List[int],
            EllipsisType,
            Tuple[Union[int, slice, List[int], EllipsisType], ...],
        ],
    ):
        """Get item implementation that routes to either direct slicing or multi_index
        based on the type of indices provided.

        Args:
            key:
                Slice or list of indices for each dimension in the array.

        Returns:
            Whatever the subclass's ``_direct_slice`` or ``_multi_index``
            returns for the normalized key.

        Raises:
            IndexError:
                If more indices than dimensions are given, if more than one
                Ellipsis is present, or if an Ellipsis is combined with
                list-based (multi-index) access.
        """
        if not isinstance(key, tuple):
            key = (key,)

        if len(key) > self.ndim:
            raise IndexError(f"Invalid number of dimensions: got {len(key)}, expected {self.ndim}")

        # Normalize all indices
        normalized_key = tuple(SliceHelper.normalize_index(idx, self.shape[i]) for i, idx in enumerate(key))

        num_ellipsis = sum(isinstance(i, EllipsisType) for i in normalized_key)
        if num_ellipsis > 1:
            raise IndexError(f"Found more than 1 Ellipsis (...) in key: {normalized_key}")

        # Direct slicing is possible only when every index is a slice or Ellipsis.
        use_direct = all(isinstance(idx, (slice, EllipsisType)) for idx in normalized_key)
        if use_direct:
            return self._direct_slice(normalized_key)

        if num_ellipsis > 0:
            raise IndexError(f"tiledb does not support ellipsis in multi-index access: {normalized_key}")
        return self._multi_index(normalized_key)

    @abstractmethod
    def _direct_slice(self, key: Tuple[Union[slice, EllipsisType], ...]) -> np.ndarray:
        """Implementation for direct slicing."""
        pass

    @abstractmethod
    def _multi_index(self, key: Tuple[Union[slice, List[int]], ...]) -> np.ndarray:
        """Implementation for multi-index access."""
        pass

    def vacuum(self) -> None:
        """Remove deleted fragments from the array.

        Runs with this object's TileDB context (the default context when
        none was configured).
        """
        tiledb.vacuum(self.uri, ctx=self._ctx)

    def consolidate(self, config: Optional[ConsolidationConfig] = None) -> None:
        """Consolidate array fragments.

        Runs with this object's TileDB context (the default context when
        none was configured).

        Args:
            config:
                Optional consolidation configuration.
                Defaults to ``ConsolidationConfig()``.
        """
        if config is None:
            config = ConsolidationConfig()

        consolidation_cfg = tiledb.Config()

        consolidation_cfg["sm.consolidation.steps"] = config.steps
        consolidation_cfg["sm.consolidation.step_min_frags"] = config.step_min_frags
        consolidation_cfg["sm.consolidation.step_max_frags"] = config.step_max_frags
        consolidation_cfg["sm.consolidation.buffer_size"] = config.buffer_size
        consolidation_cfg["sm.mem.total_budget"] = config.total_budget

        tiledb.consolidate(self.uri, config=consolidation_cfg, ctx=self._ctx)

        if config.vacuum_after:
            self.vacuum()

    @abstractmethod
    def write_batch(self, data: Union[np.ndarray, sparse.spmatrix], start_row: int, **kwargs) -> None:
        """Write a batch of data to the array starting at the specified row.

        Args:
            data:
                Data to write (numpy array for dense, scipy sparse matrix for sparse).

            start_row:
                Starting row index for writing.

            **kwargs:
                Additional arguments for write operation.
        """
        pass
|
|
@@ -7,7 +7,7 @@ from typing import List, Tuple, Union
|
|
|
7
7
|
|
|
8
8
|
import numpy as np
|
|
9
9
|
|
|
10
|
-
from .
|
|
10
|
+
from .base import CellArray
|
|
11
11
|
from .helpers import SliceHelper
|
|
12
12
|
|
|
13
13
|
__author__ = "Jayaram Kancherla"
|
|
@@ -92,7 +92,6 @@ class DenseCellArray(CellArray):
|
|
|
92
92
|
if len(data.shape) != self.ndim:
|
|
93
93
|
raise ValueError(f"Data dimensions {data.shape} don't match array dimensions {self.shape}.")
|
|
94
94
|
|
|
95
|
-
# Check bounds
|
|
96
95
|
end_row = start_row + data.shape[0]
|
|
97
96
|
if end_row > self.shape[0]:
|
|
98
97
|
raise ValueError(
|
|
@@ -102,7 +101,6 @@ class DenseCellArray(CellArray):
|
|
|
102
101
|
if self.ndim == 2 and data.shape[1] != self.shape[1]:
|
|
103
102
|
raise ValueError(f"Data columns {data.shape[1]} don't match array columns {self.shape[1]}.")
|
|
104
103
|
|
|
105
|
-
# Construct write region
|
|
106
104
|
if self.ndim == 1:
|
|
107
105
|
write_region = slice(start_row, end_row)
|
|
108
106
|
else: # 2D
|
|
@@ -110,4 +108,5 @@ class DenseCellArray(CellArray):
|
|
|
110
108
|
|
|
111
109
|
# write_data = {self._attr: data} if len(self.attr_names) > 1 else data
|
|
112
110
|
with self.open_array(mode="w") as array:
|
|
111
|
+
print("write_region", write_region)
|
|
113
112
|
array[write_region] = data
|
|
@@ -8,7 +8,7 @@ from typing import List, Optional, Tuple, Union
|
|
|
8
8
|
import numpy as np
|
|
9
9
|
import tiledb
|
|
10
10
|
|
|
11
|
-
from .config import CellArrConfig
|
|
11
|
+
from ..utils.config import CellArrConfig
|
|
12
12
|
|
|
13
13
|
__author__ = "Jayaram Kancherla"
|
|
14
14
|
__copyright__ = "Jayaram Kancherla"
|
|
@@ -52,7 +52,7 @@ def create_cellarray(
|
|
|
52
52
|
Optional list of dimension names.
|
|
53
53
|
|
|
54
54
|
dim_dtypes:
|
|
55
|
-
Optional list of dimension dtypes.
|
|
55
|
+
Optional list of dimension dtypes. Defaults to numpy's uint32.
|
|
56
56
|
|
|
57
57
|
attr_name:
|
|
58
58
|
Name of the data attribute.
|
|
@@ -67,29 +67,28 @@ def create_cellarray(
|
|
|
67
67
|
ValueError: If dimensions are invalid or inputs are inconsistent.
|
|
68
68
|
"""
|
|
69
69
|
config = config or CellArrConfig()
|
|
70
|
+
tiledb_ctx = tiledb.Config(config.ctx_config) if config.ctx_config else None
|
|
70
71
|
|
|
71
72
|
if attr_dtype is None:
|
|
72
73
|
attr_dtype = np.float32
|
|
73
74
|
if isinstance(attr_dtype, str):
|
|
74
75
|
attr_dtype = np.dtype(attr_dtype)
|
|
75
76
|
|
|
76
|
-
# Require either shape or dim_dtypes
|
|
77
77
|
if shape is None and dim_dtypes is None:
|
|
78
78
|
raise ValueError("Either 'shape' or 'dim_dtypes' must be provided.")
|
|
79
79
|
|
|
80
80
|
if shape is not None:
|
|
81
81
|
if len(shape) not in (1, 2):
|
|
82
|
-
raise ValueError("
|
|
82
|
+
raise ValueError("Shape must have 1 or 2 dimensions.")
|
|
83
83
|
|
|
84
84
|
# Set dimension dtypes, defaults to numpy uint32
|
|
85
85
|
if dim_dtypes is None:
|
|
86
86
|
dim_dtypes = [np.uint32] * len(shape)
|
|
87
87
|
else:
|
|
88
88
|
if len(dim_dtypes) not in (1, 2):
|
|
89
|
-
raise ValueError("
|
|
89
|
+
raise ValueError("Array must have 1 or 2 dimensions.")
|
|
90
90
|
dim_dtypes = [np.dtype(dt) if isinstance(dt, str) else dt for dt in dim_dtypes]
|
|
91
91
|
|
|
92
|
-
# Calculate shape from dtypes if needed
|
|
93
92
|
if shape is None:
|
|
94
93
|
shape = tuple(np.iinfo(dt).max if np.issubdtype(dt, np.integer) else None for dt in dim_dtypes)
|
|
95
94
|
if None in shape:
|
|
@@ -97,7 +96,6 @@ def create_cellarray(
|
|
|
97
96
|
np.iinfo(dt).max if s is None and np.issubdtype(dt, np.integer) else s for s, dt in zip(shape, dim_dtypes)
|
|
98
97
|
)
|
|
99
98
|
|
|
100
|
-
# Set dimension names
|
|
101
99
|
if dim_names is None:
|
|
102
100
|
dim_names = [f"dim_{i}" for i in range(len(shape))]
|
|
103
101
|
|
|
@@ -107,37 +105,44 @@ def create_cellarray(
|
|
|
107
105
|
|
|
108
106
|
dom = tiledb.Domain(
|
|
109
107
|
*[
|
|
110
|
-
tiledb.Dim(
|
|
108
|
+
tiledb.Dim(
|
|
109
|
+
name=name,
|
|
110
|
+
# supporting empty dimensions
|
|
111
|
+
domain=(0, 0 if s == 0 else s - 1),
|
|
112
|
+
tile=min(1 if s == 0 else s // 2, config.tile_capacity // 2),
|
|
113
|
+
dtype=dt,
|
|
114
|
+
)
|
|
111
115
|
for name, s, dt in zip(dim_names, shape, dim_dtypes)
|
|
112
116
|
],
|
|
113
|
-
ctx=
|
|
117
|
+
ctx=tiledb_ctx,
|
|
114
118
|
)
|
|
115
|
-
|
|
116
|
-
attr = tiledb.Attr(
|
|
119
|
+
attr_obj = tiledb.Attr(
|
|
117
120
|
name=attr_name,
|
|
118
121
|
dtype=attr_dtype,
|
|
119
122
|
filters=config.attrs_filters.get(attr_name, config.attrs_filters.get("", None)),
|
|
123
|
+
ctx=tiledb_ctx,
|
|
120
124
|
)
|
|
121
|
-
|
|
122
125
|
schema = tiledb.ArraySchema(
|
|
123
126
|
domain=dom,
|
|
124
|
-
attrs=[
|
|
127
|
+
attrs=[attr_obj],
|
|
125
128
|
cell_order=config.cell_order,
|
|
126
129
|
tile_order=config.tile_order,
|
|
127
130
|
sparse=sparse,
|
|
128
131
|
coords_filters=config.coords_filters,
|
|
129
132
|
offsets_filters=config.offsets_filters,
|
|
130
|
-
ctx=
|
|
133
|
+
ctx=tiledb_ctx,
|
|
131
134
|
)
|
|
132
|
-
|
|
133
|
-
tiledb.Array.create(uri, schema)
|
|
135
|
+
tiledb.Array.create(uri, schema, ctx=tiledb_ctx)
|
|
134
136
|
|
|
135
137
|
# Import here to avoid circular imports
|
|
136
|
-
from .
|
|
137
|
-
from .
|
|
138
|
+
from .dense import DenseCellArray
|
|
139
|
+
from .sparse import SparseCellArray
|
|
138
140
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
+
return (
|
|
142
|
+
SparseCellArray(uri=uri, attr=attr_name, mode=mode, config_or_context=tiledb_ctx)
|
|
143
|
+
if sparse
|
|
144
|
+
else DenseCellArray(uri=uri, attr=attr_name, mode=mode, config_or_context=tiledb_ctx)
|
|
145
|
+
)
|
|
141
146
|
|
|
142
147
|
|
|
143
148
|
class SliceHelper:
|
|
@@ -145,19 +150,27 @@ class SliceHelper:
|
|
|
145
150
|
|
|
146
151
|
@staticmethod
|
|
147
152
|
def is_contiguous_indices(indices: List[int]) -> Optional[slice]:
|
|
148
|
-
"""Check if indices can be represented as a contiguous slice."""
|
|
149
153
|
if not indices:
|
|
150
154
|
return None
|
|
151
155
|
|
|
152
|
-
|
|
156
|
+
sorted_indices = sorted(list(set(indices)))
|
|
157
|
+
if not sorted_indices:
|
|
158
|
+
return None
|
|
159
|
+
|
|
160
|
+
if len(sorted_indices) == 1:
|
|
161
|
+
return slice(sorted_indices[0], sorted_indices[0] + 1, None)
|
|
162
|
+
|
|
163
|
+
diffs = np.diff(sorted_indices)
|
|
153
164
|
if np.all(diffs == 1):
|
|
154
|
-
return slice(
|
|
165
|
+
return slice(sorted_indices[0], sorted_indices[-1] + 1, None)
|
|
166
|
+
|
|
155
167
|
return None
|
|
156
168
|
|
|
157
169
|
@staticmethod
|
|
158
|
-
def normalize_index(
|
|
170
|
+
def normalize_index(
|
|
171
|
+
idx: Union[int, range, slice, List[int], EllipsisType], dim_size: int
|
|
172
|
+
) -> Union[slice, List[int], EllipsisType]:
|
|
159
173
|
"""Normalize index to handle negative indices and ensure consistency."""
|
|
160
|
-
|
|
161
174
|
if isinstance(idx, EllipsisType):
|
|
162
175
|
return idx
|
|
163
176
|
|
|
@@ -166,36 +179,61 @@ class SliceHelper:
|
|
|
166
179
|
idx = slice(idx.start, idx.stop, idx.step)
|
|
167
180
|
|
|
168
181
|
if isinstance(idx, slice):
|
|
169
|
-
start = idx.start
|
|
170
|
-
stop = idx.stop
|
|
182
|
+
start = idx.start
|
|
183
|
+
stop = idx.stop
|
|
171
184
|
step = idx.step
|
|
172
185
|
|
|
186
|
+
# Resolve None to full dimension slice parts
|
|
187
|
+
if start is None:
|
|
188
|
+
start = 0
|
|
189
|
+
|
|
190
|
+
if stop is None:
|
|
191
|
+
stop = dim_size
|
|
192
|
+
|
|
173
193
|
# Handle negative indices
|
|
174
194
|
if start < 0:
|
|
175
|
-
start
|
|
176
|
-
|
|
195
|
+
start += dim_size
|
|
177
196
|
if stop < 0:
|
|
178
|
-
stop
|
|
197
|
+
stop += dim_size
|
|
179
198
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
if
|
|
183
|
-
|
|
199
|
+
# slice allows start > dim_size or stop < 0 to result in empty slices.
|
|
200
|
+
# Note: start == dim_size is OK for empty slice like arr[dim_size:]
|
|
201
|
+
if start < 0 or (start >= dim_size and dim_size > 0):
|
|
202
|
+
if not (start == dim_size and (step is None or step > 0)):
|
|
203
|
+
if start >= dim_size:
|
|
204
|
+
raise IndexError(
|
|
205
|
+
f"Start index {idx.start if idx.start is not None else 'None'} results in {start}, which is out of bounds for dimension size {dim_size}."
|
|
206
|
+
)
|
|
184
207
|
|
|
185
|
-
|
|
208
|
+
# Clamping slice arguments to dimensions
|
|
209
|
+
stop = min(stop, dim_size)
|
|
210
|
+
start = max(0, start)
|
|
186
211
|
|
|
212
|
+
return slice(start, stop, step)
|
|
187
213
|
elif isinstance(idx, list):
|
|
214
|
+
if not idx:
|
|
215
|
+
return []
|
|
216
|
+
|
|
188
217
|
norm_idx = [i if i >= 0 else dim_size + i for i in idx]
|
|
189
218
|
if any(i < 0 or i >= dim_size for i in norm_idx):
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
219
|
+
oob_indices = [orig_i for orig_i, norm_i in zip(idx, norm_idx) if not (0 <= norm_i < dim_size)]
|
|
220
|
+
raise IndexError(
|
|
221
|
+
f"List indices {oob_indices} (original values) are out of bounds for dimension size {dim_size}."
|
|
222
|
+
)
|
|
223
|
+
|
|
224
|
+
# TileDB multi_index usually returns data sorted by coordinates
|
|
225
|
+
return sorted(list(set(norm_idx)))
|
|
226
|
+
elif isinstance(idx, (int, np.integer)):
|
|
227
|
+
norm_idx = int(idx)
|
|
228
|
+
if norm_idx < 0:
|
|
229
|
+
norm_idx += dim_size
|
|
230
|
+
|
|
231
|
+
if not (0 <= norm_idx < dim_size):
|
|
197
232
|
raise IndexError(f"Index {idx} out of bounds for dimension size {dim_size}")
|
|
233
|
+
|
|
198
234
|
return slice(norm_idx, norm_idx + 1, None)
|
|
235
|
+
else:
|
|
236
|
+
raise TypeError(f"Index type {type(idx)} not supported for normalization.")
|
|
199
237
|
|
|
200
238
|
|
|
201
239
|
def create_group(output_path, group_name):
|