cellarr-array 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cellarr-array might be problematic. Click here for more details.
- cellarr_array/CellArray.py +236 -0
- cellarr_array/DenseCellArray.py +108 -0
- cellarr_array/SparseCellArray.py +202 -0
- cellarr_array/__init__.py +21 -0
- cellarr_array/config.py +74 -0
- cellarr_array/helpers.py +194 -0
- cellarr_array-0.0.1.dist-info/LICENSE.txt +21 -0
- cellarr_array-0.0.1.dist-info/METADATA +161 -0
- cellarr_array-0.0.1.dist-info/RECORD +11 -0
- cellarr_array-0.0.1.dist-info/WHEEL +5 -0
- cellarr_array-0.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from contextlib import contextmanager
|
|
3
|
+
from typing import List, Literal, Optional, Tuple, Union
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import tiledb
|
|
7
|
+
from scipy import sparse
|
|
8
|
+
|
|
9
|
+
from .config import ConsolidationConfig
|
|
10
|
+
from .helpers import SliceHelper
|
|
11
|
+
|
|
12
|
+
__author__ = "Jayaram Kancherla"
|
|
13
|
+
__copyright__ = "Jayaram Kancherla"
|
|
14
|
+
__license__ = "MIT"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class CellArray(ABC):
    """Abstract base class for TileDB array operations.

    Subclasses provide the storage-specific pieces (``_direct_slice``,
    ``_multi_index`` and ``write_batch``); this class handles context setup,
    cached schema metadata, index normalization and fragment maintenance.
    """

    def __init__(
        self,
        uri: str,
        attr: str = "data",
        mode: Optional[Literal["r", "w", "m", "d"]] = None,
        config_or_context: Optional[Union[tiledb.Config, tiledb.Ctx]] = None,
        validate: bool = True,
    ):
        """Initialize the object.

        Args:
            uri:
                URI to the array.

            attr:
                Attribute to access.
                Defaults to "data".

            mode:
                Open the array object in read 'r', write 'w', modify
                exclusive 'm' mode, or delete 'd' mode.

                Defaults to None for automatic mode switching.

            config_or_context:
                Config or context object.
                Defaults to None.

            validate:
                Whether to validate the attributes.
                Defaults to True.

        Raises:
            TypeError:
                If ``config_or_context`` is neither a TileDB config nor a
                context object.

            ValueError:
                If validation is enabled and the array has more than two
                dimensions or ``attr`` is not one of its attributes.
        """
        self.uri = uri
        self._mode = mode

        if config_or_context is None:
            config_or_context = tiledb.Config()

        if isinstance(config_or_context, tiledb.Config):
            ctx = tiledb.Ctx(config_or_context)
        elif isinstance(config_or_context, tiledb.Ctx):
            ctx = config_or_context
        else:
            raise TypeError("'config_or_context' must be either TileDB config or a context object.")

        self._ctx = ctx
        self._array = None
        # Schema-level metadata is immutable once the array exists, so it is
        # read lazily and cached by the properties below.
        self._shape = None
        self._ndim = None
        self._dim_names = None
        self._attr_names = None

        if validate:
            self._validate(attr=attr)

        self._attr = attr

    def _validate(self, attr):
        """Check that the array is at most 2D and that ``attr`` exists in it."""
        with self.open_array(mode="r") as A:
            if A.ndim > 2:
                raise ValueError("Only 1D and 2D arrays are supported.")

        if attr not in self.attr_names:
            raise ValueError(
                f"Attribute '{attr}' does not exist in the array. Available attributes: {self.attr_names}."
            )

    @property
    def mode(self) -> Optional[str]:
        """Get current array mode."""
        return self._mode

    @mode.setter
    def mode(self, value: Optional[str]):
        """Set array mode.

        Args:
            value:
                One of `None`, 'r', 'w', or 'm', 'd'.

        Raises:
            ValueError:
                If ``value`` is not one of the accepted modes.
        """
        if value is not None and value not in ["r", "w", "m", "d"]:
            raise ValueError("Mode must be one of: None, 'r', 'w', 'm', 'd'")
        self._mode = value

    @property
    def dim_names(self) -> List[str]:
        """Get dimension names of the array."""
        if self._dim_names is None:
            with self.open_array(mode="r") as A:
                self._dim_names = [dim.name for dim in A.schema.domain]
        return self._dim_names

    @property
    def attr_names(self) -> List[str]:
        """Get attribute names of the array."""
        if self._attr_names is None:
            with self.open_array(mode="r") as A:
                self._attr_names = [A.schema.attr(i).name for i in range(A.schema.nattr)]
        return self._attr_names

    @property
    def shape(self) -> Tuple[int, ...]:
        """Get array shape from schema domain."""
        if self._shape is None:
            with self.open_array(mode="r") as A:
                # Domain bounds are inclusive, hence the +1.
                self._shape = tuple(int(dim.domain[1] - dim.domain[0] + 1) for dim in A.schema.domain)
        return self._shape

    @property
    def nonempty_domain(self) -> Tuple[int, ...]:
        """Get array non-empty domain.

        Returns whatever ``tiledb``'s ``nonempty_domain()`` reports for the
        array. Unlike the schema properties this value changes with every
        write, so it is queried on each access instead of being cached.
        """
        with self.open_array(mode="r") as A:
            return A.nonempty_domain()

    @property
    def ndim(self) -> int:
        """Get number of dimensions."""
        if self._ndim is None:
            self._ndim = len(self.shape)
        return self._ndim

    @contextmanager
    def open_array(self, mode: Optional[str] = None):
        """Context manager for array operations.

        The array is opened with this object's context and is always closed
        when the ``with`` block exits.

        Args:
            mode:
                Override mode for this operation. Falls back to the
                object's mode and finally to read mode 'r'.
        """
        mode = mode if mode is not None else self.mode
        mode = mode if mode is not None else "r"  # Default to read mode

        array = tiledb.open(self.uri, mode=mode, ctx=self._ctx)
        try:
            yield array
        finally:
            array.close()

    def __getitem__(self, key: Union[slice, Tuple[Union[slice, List[int]], ...]]):
        """Get item implementation that routes to either direct slicing or multi_index
        based on the type of indices provided.

        Args:
            key:
                Slice or list of indices for each dimension in the array.

        Raises:
            IndexError:
                If more indices are given than the array has dimensions,
                or an index is out of bounds.
        """
        if not isinstance(key, tuple):
            key = (key,)

        if len(key) > self.ndim:
            raise IndexError(f"Invalid number of dimensions: got {len(key)}, expected {self.ndim}")

        # Normalize all indices
        normalized_key = tuple(SliceHelper.normalize_index(idx, self.shape[i]) for i, idx in enumerate(key))

        # Direct slicing is only possible when every index is a slice.
        if all(isinstance(idx, slice) for idx in normalized_key):
            return self._direct_slice(normalized_key)

        return self._multi_index(normalized_key)

    @abstractmethod
    def _direct_slice(self, key: Tuple[slice, ...]) -> np.ndarray:
        """Implementation for direct slicing."""
        pass

    @abstractmethod
    def _multi_index(self, key: Tuple[Union[slice, List[int]], ...]) -> np.ndarray:
        """Implementation for multi-index access."""
        pass

    def vacuum(self) -> None:
        """Remove deleted fragments from the array."""
        tiledb.vacuum(self.uri)

    def consolidate(self, config: Optional[ConsolidationConfig] = None) -> None:
        """Consolidate array fragments.

        Only the step, buffer and memory-budget settings of ``config`` are
        forwarded to TileDB here; ``config.num_threads`` is not applied.

        Args:
            config:
                Optional consolidation configuration.
                Defaults to ``ConsolidationConfig()``.
        """
        if config is None:
            config = ConsolidationConfig()

        consolidation_cfg = tiledb.Config()

        consolidation_cfg["sm.consolidation.steps"] = config.steps
        consolidation_cfg["sm.consolidation.step_min_frags"] = config.step_min_frags
        consolidation_cfg["sm.consolidation.step_max_frags"] = config.step_max_frags
        consolidation_cfg["sm.consolidation.buffer_size"] = config.buffer_size
        consolidation_cfg["sm.mem.total_budget"] = config.total_budget

        tiledb.consolidate(self.uri, config=consolidation_cfg)

        if config.vacuum_after:
            self.vacuum()

    @abstractmethod
    def write_batch(self, data: Union[np.ndarray, sparse.spmatrix], start_row: int, **kwargs) -> None:
        """Write a batch of data to the array starting at the specified row.

        Args:
            data:
                Data to write (numpy array for dense, scipy sparse matrix for sparse).

            start_row:
                Starting row index for writing.

            **kwargs:
                Additional arguments for write operation.
        """
        pass
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
from typing import List, Tuple, Union
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
from .CellArray import CellArray
|
|
6
|
+
from .helpers import SliceHelper
|
|
7
|
+
|
|
8
|
+
__author__ = "Jayaram Kancherla"
|
|
9
|
+
__copyright__ = "Jayaram Kancherla"
|
|
10
|
+
__license__ = "MIT"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class DenseCellArray(CellArray):
    """Implementation for dense TileDB arrays."""

    def _direct_slice(self, key: Tuple[slice, ...]) -> np.ndarray:
        """Implementation for direct slicing of dense arrays.

        Args:
            key:
                Tuple of slice objects.

        Returns:
            Sliced data for the configured attribute, or the raw TileDB
            result when no attribute is set.
        """
        with self.open_array(mode="r") as array:
            res = array[key]
            return res[self._attr] if self._attr is not None else res

    def _multi_index(self, key: Tuple[Union[slice, List[int]], ...]) -> np.ndarray:
        """Implementation for multi-index access of dense arrays.

        Args:
            key:
                Tuple of slice objects or index lists.

        Returns:
            Sliced data for the configured attribute, or the raw TileDB
            result when no attribute is set.
        """
        # Try to optimize contiguous indices to slices
        optimized_key = []
        for idx in key:
            if isinstance(idx, list):
                slice_idx = SliceHelper.is_contiguous_indices(idx)
                optimized_key.append(slice_idx if slice_idx is not None else idx)
            else:
                optimized_key.append(idx)

        # If all indices are now slices, use direct slicing
        if all(isinstance(idx, slice) for idx in optimized_key):
            return self._direct_slice(tuple(optimized_key))

        # Mixed slice/list query: `multi_index` treats slice bounds as
        # inclusive, so shift every stop down by one. Iterating over the
        # optimized key keeps contiguous lists collapsed into one range.
        tiledb_key = []
        for idx in optimized_key:
            if isinstance(idx, slice):
                stop = None if idx.stop is None else idx.stop - 1
                tiledb_key.append(slice(idx.start, stop, idx.step))
            else:
                tiledb_key.append(idx)

        with self.open_array(mode="r") as array:
            res = array.multi_index[tuple(tiledb_key)]
            return res[self._attr] if self._attr is not None else res

    def write_batch(self, data: np.ndarray, start_row: int, **kwargs) -> None:
        """Write a batch of data to the dense array.

        Args:
            data:
                Numpy array to write.

            start_row:
                Starting row index for writing. Must be non-negative.

            **kwargs:
                Accepted for interface compatibility; currently not
                forwarded to the TileDB write.

        Raises:
            TypeError: If input is not a numpy array.
            ValueError: If dimensions don't match or bounds are exceeded.
        """
        if not isinstance(data, np.ndarray):
            raise TypeError("Input must be a numpy array.")

        if len(data.shape) != self.ndim:
            raise ValueError(f"Data dimensions {data.shape} don't match array dimensions {self.shape}.")

        # Check bounds. A negative start would pass the upper-bound check
        # below, so reject it explicitly.
        if start_row < 0:
            raise ValueError(f"Write operation would exceed array bounds. Start row {start_row} < 0.")

        end_row = start_row + data.shape[0]
        if end_row > self.shape[0]:
            raise ValueError(
                f"Write operation would exceed array bounds. End row {end_row} > array rows {self.shape[0]}."
            )

        if self.ndim == 2 and data.shape[1] != self.shape[1]:
            raise ValueError(f"Data columns {data.shape[1]} don't match array columns {self.shape[1]}.")

        # Construct write region
        if self.ndim == 1:
            write_region = slice(start_row, end_row)
        else:  # 2D
            write_region = (slice(start_row, end_row), slice(0, self.shape[1]))

        with self.open_array(mode="w") as array:
            array[write_region] = data
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import tiledb
|
|
5
|
+
from scipy import sparse
|
|
6
|
+
|
|
7
|
+
from .CellArray import CellArray
|
|
8
|
+
from .helpers import SliceHelper
|
|
9
|
+
|
|
10
|
+
__author__ = "Jayaram Kancherla"
|
|
11
|
+
__copyright__ = "Jayaram Kancherla"
|
|
12
|
+
__license__ = "MIT"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SparseCellArray(CellArray):
    """Implementation for sparse TileDB arrays."""

    def __init__(
        self,
        uri: str,
        attr: str = "data",
        mode: Optional[str] = None,
        config_or_context: Optional[Union[tiledb.Config, tiledb.Ctx]] = None,
        return_sparse: bool = True,
        sparse_coerce: Union[sparse.csr_matrix, sparse.csc_matrix] = sparse.csr_matrix,
    ):
        """Initialize SparseCellArray.

        Args:
            uri:
                URI to the array.

            attr:
                Attribute to access.
                Defaults to "data".

            mode:
                Array open mode.
                Defaults to None for automatic mode switching.

            config_or_context:
                Config or context object.
                Defaults to None.

            return_sparse:
                Whether reads return a scipy sparse matrix (True) or the
                raw TileDB result (False).
                Defaults to True.

            sparse_coerce:
                Sparse matrix class used for returned results.
                Defaults to ``scipy.sparse.csr_matrix`` (also used when
                None is passed).
        """
        super().__init__(uri, attr, mode, config_or_context)

        self.return_sparse = return_sparse
        self.sparse_coerce = sparse.csr_matrix if sparse_coerce is None else sparse_coerce

    def _validate_matrix_dims(self, data: sparse.spmatrix) -> Tuple[sparse.coo_matrix, bool]:
        """Validate and adjust matrix dimensions if needed.

        Args:
            data:
                Input sparse matrix.

        Returns:
            Tuple of (adjusted matrix, is_1d flag).

        Raises:
            ValueError: If dimensions are incompatible.
        """
        coo_data = data.tocoo() if not isinstance(data, sparse.coo_matrix) else data

        is_1d = self.ndim == 1
        if is_1d:
            if coo_data.shape[0] == 1:
                # Convert (1,N) to (N,1)
                coo_data = sparse.coo_matrix(
                    (coo_data.data, (coo_data.col, np.zeros_like(coo_data.col))), shape=(coo_data.shape[1], 1)
                )
            elif coo_data.shape[1] != 1:
                raise ValueError(f"1D array expects (N, 1) matrix, got {coo_data.shape}")

        return coo_data, is_1d

    def _get_slice_shape(self, key: Tuple[Union[slice, List[int]], ...]) -> Tuple[int, ...]:
        """Calculate shape of sliced result.

        Slices contribute ``stop - start`` (so ``stop`` must not be None),
        lists contribute their number of distinct entries, and anything
        else counts as a single element.

        Returns:
            ``(n, 1)`` for 1D arrays so the shape stays 2D for sparse
            matrix compatibility; one entry per key element otherwise.
        """
        shape = []
        for idx in key:
            if isinstance(idx, slice):
                shape.append(idx.stop - (idx.start or 0))
            elif isinstance(idx, list):
                shape.append(len(set(idx)))
            else:  # single integer
                shape.append(1)

        if self.ndim == 1:
            return (shape[0], 1)
        return tuple(shape)

    def _to_sparse_format(
        self, result: Dict[str, np.ndarray], key: Tuple[Union[slice, List[int]], ...], shape: Tuple[int, ...]
    ) -> Union[np.ndarray, sparse.spmatrix]:
        """Convert TileDB result to the configured sparse format.

        A matrix covering the full array ``shape`` is built from the
        returned coordinates and then indexed with ``key``, so the output
        only spans the requested region. 1D arrays are represented as a
        single-row ``(1, N)`` matrix.

        Args:
            result:
                TileDB query result holding the attribute values and one
                coordinate array per dimension.

            key:
                Normalized index for each dimension.

            shape:
                Full shape of the array.

        Returns:
            Sparse matrix of type ``sparse_coerce``, or ``result``
            unchanged when it is empty and ``return_sparse`` is False.
        """
        data = result[self._attr]
        is_1d = self.ndim == 1
        matrix_shape = (1, shape[0]) if is_1d else shape

        use_csc = self.sparse_coerce in (sparse.csc_matrix, sparse.csc_array)
        use_csr = self.sparse_coerce in (sparse.csr_matrix, sparse.csr_array)
        # Only CSR/CSC are indexed directly. Any other requested class is
        # sliced as CSR and converted at the end.
        sliceable_type = use_csc or use_csr

        if len(data) == 0:
            if not self.return_sparse:
                return result
            if sliceable_type:
                matrix = self.sparse_coerce(matrix_shape)
            else:
                matrix = sparse.csr_matrix(matrix_shape)
        else:
            coords = [result[dim_name] for dim_name in self.dim_names]

            # For 1D arrays, add zero row coordinates to form a (1, N) matrix
            if is_1d:
                coords = [np.zeros_like(coords[0]), coords[0]]

            coo = sparse.coo_matrix((data, tuple(coords)), shape=matrix_shape)
            matrix = coo.tocsc() if use_csc else coo.tocsr()

        if is_1d:
            sliced = matrix[:, key[0]]
        elif len(key) == 2 and all(isinstance(k, list) for k in key):
            # scipy pairs two index lists element-wise; index one axis at
            # a time to get the row/column cross product that the TileDB
            # multi_index query returned.
            sliced = matrix[key[0], :][:, key[1]]
        else:
            sliced = matrix[key]

        return sliced if sliceable_type else self.sparse_coerce(sliced)

    def _direct_slice(self, key: Tuple[slice, ...]) -> Union[np.ndarray, sparse.coo_matrix]:
        """Implementation for direct slicing of sparse arrays."""
        with self.open_array(mode="r") as array:
            result = array[key]

        if not self.return_sparse:
            return result

        return self._to_sparse_format(result, key, self.shape)

    def _multi_index(self, key: Tuple[Union[slice, List[int]], ...]) -> Union[np.ndarray, sparse.coo_matrix]:
        """Implementation for multi-index access of sparse arrays."""
        # Try to optimize contiguous indices to slices
        optimized_key = []
        for idx in key:
            if isinstance(idx, list):
                slice_idx = SliceHelper.is_contiguous_indices(idx)
                optimized_key.append(slice_idx if slice_idx is not None else idx)
            else:
                optimized_key.append(idx)

        if all(isinstance(idx, slice) for idx in optimized_key):
            return self._direct_slice(tuple(optimized_key))

        # Mixed slice/list query: `multi_index` slice bounds are inclusive,
        # so shift every stop down by one.
        tiledb_key = []
        for idx in optimized_key:
            if isinstance(idx, slice):
                stop = None if idx.stop is None else idx.stop - 1
                tiledb_key.append(slice(idx.start, stop, idx.step))
            else:
                tiledb_key.append(idx)

        with self.open_array(mode="r") as array:
            result = array.multi_index[tuple(tiledb_key)]

        if not self.return_sparse:
            return result

        # The original key (lists included) decides the final row/column
        # selection and ordering.
        return self._to_sparse_format(result, key, self.shape)

    def write_batch(
        self, data: Union[sparse.spmatrix, sparse.csc_matrix, sparse.coo_matrix], start_row: int, **kwargs
    ) -> None:
        """Write a batch of sparse data to the array.

        Args:
            data:
                Scipy sparse matrix (CSR, CSC, or COO format).

            start_row:
                Starting row index for writing. Must be non-negative.

            **kwargs:
                Accepted for interface compatibility; currently not
                forwarded to the TileDB write.

        Raises:
            TypeError: If input is not a sparse matrix.
            ValueError: If dimensions don't match or bounds are exceeded.
        """
        if not sparse.issparse(data):
            raise TypeError("Input must be a scipy sparse matrix.")

        # Validate and adjust dimensions
        data, is_1d = self._validate_matrix_dims(data)

        # Check bounds. A negative start would pass the upper-bound check
        # below, so reject it explicitly.
        if start_row < 0:
            raise ValueError(f"Write operation would exceed array bounds. Start row {start_row} < 0.")

        end_row = start_row + data.shape[0]
        if end_row > self.shape[0]:
            raise ValueError(
                f"Write operation would exceed array bounds. End row {end_row} > array rows {self.shape[0]}."
            )

        if not is_1d and data.shape[1] != self.shape[1]:
            raise ValueError(f"Data columns {data.shape[1]} don't match array columns {self.shape[1]}.")

        # Shift batch-local row coordinates to their position in the array.
        adjusted_rows = data.row + start_row
        with self.open_array(mode="w") as array:
            if is_1d:
                array[adjusted_rows] = data.data
            else:
                array[adjusted_rows, data.col] = data.data
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
|
|
3
|
+
if sys.version_info[:2] < (3, 8):
    # Interpreters older than 3.8 need the `importlib_metadata` backport.
    from importlib_metadata import PackageNotFoundError, version  # pragma: no cover
else:
    # TODO: Import directly (no need for conditional) when `python_requires = >= 3.8`
    from importlib.metadata import PackageNotFoundError, version  # pragma: no cover

# Change here if project is renamed and does not equal the package name
dist_name = "cellarr-array"

try:
    __version__ = version(dist_name)
except PackageNotFoundError:  # pragma: no cover
    # The distribution metadata is unavailable (e.g. not installed).
    __version__ = "unknown"
finally:
    # Keep the helper names out of the package namespace.
    del version, PackageNotFoundError
|
|
17
|
+
|
|
18
|
+
from .config import CellArrConfig, ConsolidationConfig
|
|
19
|
+
from .DenseCellArray import DenseCellArray
|
|
20
|
+
from .SparseCellArray import SparseCellArray
|
|
21
|
+
from .helpers import create_cellarray, SliceHelper
|
cellarr_array/config.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import Any, Dict, List, Union
|
|
3
|
+
|
|
4
|
+
import tiledb
|
|
5
|
+
|
|
6
|
+
__author__ = "Jayaram Kancherla"
|
|
7
|
+
__copyright__ = "Jayaram Kancherla"
|
|
8
|
+
__license__ = "MIT"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class CellArrConfig:
    """Configuration class for TileDB array creation and access.

    Filter fields accept either TileDB filter objects or configuration
    dictionaries of the form ``{"name": ..., "level": ...}``; they are
    converted to TileDB filter objects after initialization. A single
    filter may be given in place of a list.
    """

    tile_capacity: int = 100000
    cell_order: str = "row-major"
    tile_order: str = "row-major"
    coords_filters: List[tiledb.Filter] = field(default_factory=lambda: [tiledb.LZ4Filter()])
    offsets_filters: List[tiledb.Filter] = field(default_factory=lambda: [tiledb.LZ4Filter()])
    # Maps attribute name -> filters; the "" key holds the filters for
    # attributes without their own entry.
    attrs_filters: Dict[str, List[tiledb.Filter]] = field(default_factory=lambda: {"": [tiledb.LZ4Filter()]})
    ctx_config: Dict[str, Any] = field(default_factory=dict)

    @staticmethod
    def create_filter(filter_config: Union[Dict[str, Any], tiledb.Filter]) -> tiledb.Filter:
        """Create a TileDB Filter object from configuration.

        Args:
            filter_config:
                Either a TileDB filter (returned unchanged) or a dictionary
                with a ``"name"`` key (case-insensitive; one of "zstd",
                "gzip", "bzip2", "lz4", "double-delta",
                "bit-width-reduction") and an optional ``"level"`` key for
                the compression filters.

        Returns:
            The corresponding TileDB filter object.

        Raises:
            ValueError: If the filter name is not supported.
            TypeError: If ``filter_config`` is neither a filter nor a dict.
        """
        if isinstance(filter_config, tiledb.Filter):
            return filter_config

        if not isinstance(filter_config, dict):
            raise TypeError("Filter must be either a TileDB Filter object or a configuration dictionary")

        filter_name = str(filter_config.get("name") or "").lower()
        filter_level = filter_config.get("level", None)

        # Forward `level` only when it was given so that TileDB's own
        # default compression level applies otherwise.
        level_kwargs = {} if filter_level is None else {"level": filter_level}

        if filter_name == "zstd":
            return tiledb.ZstdFilter(**level_kwargs)
        elif filter_name == "gzip":
            return tiledb.GzipFilter(**level_kwargs)
        elif filter_name == "bzip2":
            return tiledb.Bzip2Filter(**level_kwargs)
        elif filter_name == "lz4":
            return tiledb.LZ4Filter(**level_kwargs)
        elif filter_name == "double-delta":
            return tiledb.DoubleDeltaFilter()
        elif filter_name == "bit-width-reduction":
            return tiledb.BitWidthReductionFilter()

        raise ValueError(f"Unsupported filter type: {filter_name}")

    def __post_init__(self):
        """Convert filter configurations to TileDB Filter objects."""
        if not isinstance(self.coords_filters, list):
            self.coords_filters = [self.coords_filters]
        self.coords_filters = [self.create_filter(f) for f in self.coords_filters]

        if not isinstance(self.offsets_filters, list):
            self.offsets_filters = [self.offsets_filters]
        self.offsets_filters = [self.create_filter(f) for f in self.offsets_filters]

        # Values are replaced under their existing keys, which does not
        # change the dictionary size during iteration.
        for attr, filters in self.attrs_filters.items():
            if not isinstance(filters, list):
                filters = [filters]
            self.attrs_filters[attr] = [self.create_filter(f) for f in filters]
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@dataclass
class ConsolidationConfig:
    """Settings that control fragment consolidation of an array.

    Field order is part of the constructor signature and must not change.
    """

    # Number of consolidation steps.
    steps: int = 100_000
    # Minimum / maximum number of fragments merged in one step.
    step_min_frags: int = 2
    step_max_frags: int = 10
    # Consolidation buffer size, in bytes.
    buffer_size: int = 15_000_000_000  # 15GB
    # Total memory budget, in bytes.
    total_budget: int = 40_000_000_000  # 40GB
    num_threads: int = 4
    # Whether to vacuum the array once consolidation has finished.
    vacuum_after: bool = True
|
cellarr_array/helpers.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
from typing import List, Optional, Tuple, Union
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import tiledb
|
|
5
|
+
|
|
6
|
+
from .config import CellArrConfig
|
|
7
|
+
|
|
8
|
+
__author__ = "Jayaram Kancherla"
|
|
9
|
+
__copyright__ = "Jayaram Kancherla"
|
|
10
|
+
__license__ = "MIT"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def create_cellarray(
    uri: str,
    shape: Optional[Tuple[Optional[int], ...]] = None,
    attr_dtype: Optional[Union[str, np.dtype]] = None,
    sparse: bool = False,
    mode: Optional[str] = None,
    config: Optional[CellArrConfig] = None,
    dim_names: Optional[List[str]] = None,
    dim_dtypes: Optional[List[Union[str, np.dtype]]] = None,
    attr_name: str = "data",
    **kwargs,
):
    """Factory function to create a new TileDB cell array.

    Args:
        uri:
            Array URI.

        shape:
            Optional array shape. If None or contains None, uses dtype max.

        attr_dtype:
            Data type for the attribute. Defaults to float32.

        sparse:
            Whether to create a sparse array.

        mode:
            Array open mode. Defaults to None for automatic switching.

        config:
            Optional configuration.

        dim_names:
            Optional list of dimension names.
            Defaults to ``dim_0``, ``dim_1``, ...

        dim_dtypes:
            Optional list of dimension dtypes.
            Defaults to uint32 for every dimension.

        attr_name:
            Name of the data attribute.

        **kwargs:
            Additional arguments for array creation. Currently accepted
            but not used.

    Returns:
        CellArray instance.

    Raises:
        ValueError: If dimensions are invalid or inputs are inconsistent.
    """
    config = config or CellArrConfig()

    if attr_dtype is None:
        attr_dtype = np.float32
    if isinstance(attr_dtype, str):
        attr_dtype = np.dtype(attr_dtype)

    # Require either shape or dim_dtypes
    if shape is None and dim_dtypes is None:
        raise ValueError("Either 'shape' or 'dim_dtypes' must be provided.")

    if shape is not None:
        if len(shape) not in (1, 2):
            raise ValueError("Only 1D and 2D arrays are supported.")

    # Set dimension dtypes, defaults to numpy uint32
    if dim_dtypes is None:
        dim_dtypes = [np.uint32] * len(shape)
    else:
        if len(dim_dtypes) not in (1, 2):
            raise ValueError("Only 1D and 2D arrays are supported.")
        dim_dtypes = [np.dtype(dt) if isinstance(dt, str) else dt for dt in dim_dtypes]

    # Without an explicit shape every dimension size is derived from its dtype.
    if shape is None:
        shape = (None,) * len(dim_dtypes)

    # Set dimension names
    if dim_names is None:
        dim_names = [f"dim_{i}" for i in range(len(shape))]

    # Validate all input lengths before pairing them up below; zip() would
    # otherwise silently drop the extra entries.
    if not (len(shape) == len(dim_dtypes) == len(dim_names)):
        raise ValueError("Lengths of 'shape', 'dim_dtypes', and 'dim_names' must match.")

    # Fill unspecified sizes with the largest value of an integer dimension dtype.
    # NOTE(review): with the dtype maximum as size, the domain upper bound
    # plus the tile extent may exceed what the dtype can represent --
    # confirm against TileDB's tile-extent validation.
    shape = tuple(
        np.iinfo(dt).max if s is None and np.issubdtype(dt, np.integer) else s for s, dt in zip(shape, dim_dtypes)
    )
    if any(s is None for s in shape):
        raise ValueError("Dimension sizes can only be inferred for integer 'dim_dtypes'; provide an explicit 'shape'.")

    # One context built from the configuration is shared by schema
    # creation, array creation and the returned array object.
    ctx = tiledb.Ctx(config.ctx_config)

    dom = tiledb.Domain(
        *[
            tiledb.Dim(name=name, domain=(0, s - 1), tile=min(s, config.tile_capacity), dtype=dt)
            for name, s, dt in zip(dim_names, shape, dim_dtypes)
        ],
        ctx=ctx,
    )

    attr = tiledb.Attr(
        name=attr_name,
        dtype=attr_dtype,
        # Attribute-specific filters win; the "" entry is the fallback.
        filters=config.attrs_filters.get(attr_name, config.attrs_filters.get("", None)),
    )

    schema = tiledb.ArraySchema(
        domain=dom,
        attrs=[attr],
        cell_order=config.cell_order,
        tile_order=config.tile_order,
        sparse=sparse,
        coords_filters=config.coords_filters,
        offsets_filters=config.offsets_filters,
        ctx=ctx,
    )

    tiledb.Array.create(uri, schema, ctx=ctx)

    # Import here to avoid circular imports
    from .DenseCellArray import DenseCellArray
    from .SparseCellArray import SparseCellArray

    # Return appropriate array type
    array_cls = SparseCellArray if sparse else DenseCellArray
    return array_cls(uri, attr=attr_name, mode=mode, config_or_context=ctx)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
class SliceHelper:
    """Helper class for handling array slicing operations."""

    @staticmethod
    def is_contiguous_indices(indices: List[int]) -> Optional[slice]:
        """Check if indices can be represented as a contiguous slice.

        Args:
            indices:
                Sequence of integer indices.

        Returns:
            A half-open slice covering ``indices`` when every entry is
            exactly one larger than the previous one (a single index
            qualifies as well), otherwise None. Empty input gives None.
        """
        if indices is None or len(indices) == 0:
            return None

        diffs = np.diff(indices)
        if np.all(diffs == 1):
            return slice(indices[0], indices[-1] + 1, None)
        return None

    @staticmethod
    def normalize_index(
        idx: Union[int, slice, range, List[int], Tuple[int, ...], np.ndarray], dim_size: int
    ) -> Union[slice, List[int]]:
        """Normalize index to handle negative indices and ensure consistency.

        Args:
            idx:
                A single integer, a slice, a range, or a sequence of
                integers (list, tuple or integer numpy array).

            dim_size:
                Size of the dimension being indexed.

        Returns:
            A slice with non-negative, concrete start and stop for integer,
            slice and range input (the step is passed through unchanged),
            or a list of non-negative indices for sequence input.

        Raises:
            IndexError: If any index falls outside the dimension.
            TypeError: If the index is not an integer, a slice, a range or
                a sequence of integers.
        """
        # Convert ranges to slices
        if isinstance(idx, range):
            idx = slice(idx.start, idx.stop, idx.step)

        # Treat tuples and numpy arrays like lists so downstream code only
        # ever sees slices or plain lists.
        if isinstance(idx, np.ndarray):
            if idx.ndim > 1:
                raise TypeError("Index arrays must be at most 1-dimensional.")
            if idx.size and idx.dtype.kind not in "iu":
                raise TypeError(f"Index arrays must be of integer type, got {idx.dtype}.")
            idx = idx.tolist()  # a 0-d array becomes a plain scalar
        elif isinstance(idx, tuple):
            idx = list(idx)

        if isinstance(idx, slice):
            start = idx.start if idx.start is not None else 0
            stop = idx.stop if idx.stop is not None else dim_size
            step = idx.step

            # Handle negative indices
            if start < 0:
                start = dim_size + start

            if stop < 0:
                stop = dim_size + stop

            if start < 0 or start > dim_size:
                raise IndexError(f"Start index {start} out of bounds for dimension size {dim_size}")
            if stop < 0 or stop > dim_size:
                raise IndexError(f"Stop index {stop} out of bounds for dimension size {dim_size}")

            return slice(start, stop, step)

        if isinstance(idx, list):
            norm_idx = [i if i >= 0 else dim_size + i for i in idx]
            if any(i < 0 or i >= dim_size for i in norm_idx):
                raise IndexError(f"List indices {idx} out of bounds for dimension size {dim_size}")
            return norm_idx

        # Single integer index
        if not isinstance(idx, (int, np.integer)):
            raise TypeError(f"Unsupported index type: {type(idx).__name__}")

        norm_idx = idx if idx >= 0 else dim_size + idx

        if norm_idx < 0 or norm_idx >= dim_size:
            raise IndexError(f"Index {idx} out of bounds for dimension size {dim_size}")
        return slice(norm_idx, norm_idx + 1, None)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def create_group(output_path, group_name):
    """Create a TileDB group named ``group_name`` under ``output_path``.

    The group URI is formed by joining the two arguments with a ``/``.
    """
    group_uri = f"{output_path}/{group_name}"
    tiledb.group_create(group_uri)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Jayaram Kancherla
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: cellarr-array
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Base class for handling TileDB backed arrays.
|
|
5
|
+
Home-page: https://github.com/cellarr/cellarr-array
|
|
6
|
+
Author: Jayaram Kancherla
|
|
7
|
+
Author-email: jayaram.kancherla@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Project-URL: Documentation, https://github.com/cellarr/cellarr-array
|
|
10
|
+
Platform: any
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Programming Language :: Python
|
|
13
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
14
|
+
License-File: LICENSE.txt
|
|
15
|
+
Requires-Dist: importlib-metadata; python_version < "3.8"
|
|
16
|
+
Requires-Dist: tiledb
|
|
17
|
+
Requires-Dist: numpy
|
|
18
|
+
Requires-Dist: scipy
|
|
19
|
+
Provides-Extra: testing
|
|
20
|
+
Requires-Dist: setuptools; extra == "testing"
|
|
21
|
+
Requires-Dist: pytest; extra == "testing"
|
|
22
|
+
Requires-Dist: pytest-cov; extra == "testing"
|
|
23
|
+
|
|
24
|
+
[cellarr-array on PyPI](https://pypi.org/project/cellarr-array/)
|
|
25
|
+

|
|
26
|
+
|
|
27
|
+
# cellarr-array
|
|
28
|
+
|
|
29
|
+
This package provides high-level wrappers for TileDB arrays optimized for handling genomic data matrices.
|
|
30
|
+
|
|
31
|
+
## Install
|
|
32
|
+
|
|
33
|
+
To get started, install the package from [PyPI](https://pypi.org/project/cellarr-array/)
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install cellarr-array
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Quick Start
|
|
40
|
+
|
|
41
|
+
### Creating Arrays
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
import numpy as np
|
|
45
|
+
from scipy import sparse
|
|
46
|
+
from cellarr_array import create_cellarray, CellArrConfig
|
|
47
|
+
|
|
48
|
+
# Create a dense 2D array
|
|
49
|
+
dense_array = create_cellarray(
|
|
50
|
+
uri="dense_matrix.tdb",
|
|
51
|
+
shape=(10000, 5000),
|
|
52
|
+
attr_dtype=np.float32,
|
|
53
|
+
sparse=False,
|
|
54
|
+
dim_names=["cells", "genes"]
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
# Create a sparse 2D array with custom compression
|
|
58
|
+
config = CellArrConfig(
|
|
59
|
+
tile_capacity=1000,
|
|
60
|
+
attrs_filters={"data": [{"name": "zstd", "level": 7}]}
|
|
61
|
+
)
|
|
62
|
+
sparse_array = create_cellarray(
|
|
63
|
+
uri="sparse_matrix.tdb",
|
|
64
|
+
shape=(10000, 5000),
|
|
65
|
+
attr_dtype=np.float32,
|
|
66
|
+
sparse=True,
|
|
67
|
+
config=config,
|
|
68
|
+
dim_names=["cells", "genes"]
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
# Create a 1D array
|
|
72
|
+
array_1d = create_cellarray(
|
|
73
|
+
uri="vector.tdb",
|
|
74
|
+
shape=(1000,),
|
|
75
|
+
attr_dtype=np.float32,
|
|
76
|
+
sparse=False
|
|
77
|
+
)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Writing Data
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
# Writing to dense arrays
|
|
84
|
+
data = np.random.random((1000, 5000)).astype(np.float32)
|
|
85
|
+
dense_array.write_batch(data, start_row=0)
|
|
86
|
+
|
|
87
|
+
# Writing to sparse arrays
|
|
88
|
+
sparse_data = sparse.random(1000, 5000, density=0.1, format="csr", dtype=np.float32)
|
|
89
|
+
sparse_array.write_batch(sparse_data, start_row=0)
|
|
90
|
+
|
|
91
|
+
# Writing to 1D arrays
|
|
92
|
+
data_1d = np.random.random(100).astype(np.float32)
|
|
93
|
+
array_1d.write_batch(data_1d, start_row=0)
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Reading Data
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
# Slicing operations (similar to NumPy)
|
|
100
|
+
|
|
101
|
+
# Full slice
|
|
102
|
+
full_data = dense_array[:]
|
|
103
|
+
|
|
104
|
+
# Partial slice
|
|
105
|
+
subset = dense_array[100:200, 1000:2000]
|
|
106
|
+
|
|
107
|
+
# Using lists of indices
|
|
108
|
+
cells = [10, 20, 30]
|
|
109
|
+
genes = [5, 15, 25]
|
|
110
|
+
subset = dense_array[cells, genes]
|
|
111
|
+
|
|
112
|
+
# Mixed slicing
|
|
113
|
+
subset = dense_array[100:200, genes]
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### Working with Sparse Arrays
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
# Create a sparse array with COO output format
|
|
120
|
+
coo_array = SparseCellArray(
|
|
121
|
+
uri="sparse_matrix.tdb",
|
|
122
|
+
return_coo=True
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
# Get result as COO matrix
|
|
126
|
+
result = coo_array[100:200, 500:1000]
|
|
127
|
+
|
|
128
|
+
# Result is scipy.sparse.coo_matrix
|
|
129
|
+
assert sparse.isspmatrix_coo(result)
|
|
130
|
+
|
|
131
|
+
# Perform sparse operations
|
|
132
|
+
nnz = result.nnz
|
|
133
|
+
density = result.nnz / (result.shape[0] * result.shape[1])
|
|
134
|
+
|
|
135
|
+
# Convert to other sparse formats if needed
|
|
136
|
+
result_csr = result.tocsr()
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### Array Maintenance
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
# Consolidate fragments
|
|
143
|
+
array.consolidate()
|
|
144
|
+
|
|
145
|
+
# Custom consolidation
|
|
146
|
+
config = ConsolidationConfig(
|
|
147
|
+
steps=["fragment"],
|
|
148
|
+
vacuum_after=True
|
|
149
|
+
)
|
|
150
|
+
array.consolidate(config)
|
|
151
|
+
|
|
152
|
+
# Vacuum
|
|
153
|
+
array.vacuum()
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
<!-- biocsetup-notes -->
|
|
157
|
+
|
|
158
|
+
## Note
|
|
159
|
+
|
|
160
|
+
This project has been set up using [BiocSetup](https://github.com/biocpy/biocsetup)
|
|
161
|
+
and [PyScaffold](https://pyscaffold.org/).
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
cellarr_array/CellArray.py,sha256=vOaq-0FbVKeuS31992oc_N5IOBXclcVkczPNIbua5Ws,7498
|
|
2
|
+
cellarr_array/DenseCellArray.py,sha256=iPrjFtGolnHB0BTi4A8ncEpoFI9FWe6oZHhA1Men3Wo,3745
|
|
3
|
+
cellarr_array/SparseCellArray.py,sha256=8bajVOvUMaQhWU-_pZY0Cg9sD6kWRAJCu2G45uY-W4Q,7096
|
|
4
|
+
cellarr_array/__init__.py,sha256=8m0_shRPKNNaNab5tGBL2l0K5XgkKCFuLAh7QGogfYo,778
|
|
5
|
+
cellarr_array/config.py,sha256=67zBxpYY9N_v6TMdyljUIZmckbwOBcuLC99aJooGmfA,2917
|
|
6
|
+
cellarr_array/helpers.py,sha256=O0RgDLIdYbWc01yp2Cw0EmjJ3g_uzlz2JnYE8W7PZEE,6182
|
|
7
|
+
cellarr_array-0.0.1.dist-info/LICENSE.txt,sha256=qI2hRZobcUlj8gqFqXwqt522HeYyWvHLF00zCSZofHA,1084
|
|
8
|
+
cellarr_array-0.0.1.dist-info/METADATA,sha256=UaSorFB0-5KuhVrM8pvdGuN98WQ6iSLgUUH6MtpJwXM,3747
|
|
9
|
+
cellarr_array-0.0.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
10
|
+
cellarr_array-0.0.1.dist-info/top_level.txt,sha256=oErp0D8ABZV-QPtTiXT8_F2z36Ic7ykuDg_1Y84HLZM,14
|
|
11
|
+
cellarr_array-0.0.1.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
cellarr_array
|