cellarr-array 0.0.3__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cellarr-array might be problematic. Click here for more details.

@@ -0,0 +1,167 @@
1
+ import shutil
2
+ from typing import Dict, Optional
3
+
4
+ import numpy as np
5
+ import scipy.sparse as sp
6
+ import tiledb
7
+
8
+ from ..core import DenseCellArray, SparseCellArray
9
+ from ..core.helpers import CellArrConfig, create_cellarray
10
+
11
+ __author__ = "Jayaram Kancherla"
12
+ __copyright__ = "Jayaram Kancherla"
13
+ __license__ = "MIT"
14
+
15
+
16
def generate_tiledb_dense_array(
    uri: str,
    rows: int,
    cols: int,
    attr_name: str = "data",
    attr_dtype: np.dtype = np.float32,
    chunk_size: int = 1000,
    tiledb_config: Optional[Dict] = None,
):
    """Generate a dense TileDB array and fill it with random data.

    Values are drawn with ``np.random.rand`` and cast to ``attr_dtype``
    (float32 by default). The array is written in row batches of
    ``chunk_size`` rows.

    Args:
        uri:
            URI for the new TileDB array. If an array already exists at
            this location it is removed with ``shutil.rmtree`` first.

        rows:
            Number of rows.

        cols:
            Number of columns (features).

        attr_name:
            Name of the attribute.

        attr_dtype:
            Data type of the attribute.

        chunk_size:
            Number of rows to write per batch. Must be positive.

        tiledb_config:
            TileDB context configuration.

    Raises:
        ValueError:
            If ``chunk_size`` is not a positive integer.
    """
    # Validate before touching storage: a zero step would otherwise only fail
    # inside ``range`` after the old array was deleted, and a negative step
    # would silently produce an empty array.
    if chunk_size <= 0:
        raise ValueError(f"'chunk_size' must be a positive integer, got {chunk_size}.")

    if tiledb.array_exists(uri):
        print(f"Array {uri} already exists. Removing.")
        shutil.rmtree(uri)

    print(f"Creating dense array at '{uri}' with shape ({rows}, {cols})")
    cfg = CellArrConfig(ctx_config=tiledb_config if tiledb_config else {})

    # NOTE: ``cfg`` is intentionally not forwarded to ``create_cellarray``
    # here; it is only used to build the write context below.
    create_cellarray(
        uri=uri,
        shape=(rows, cols),
        attr_dtype=attr_dtype,
        sparse=False,
        dim_names=["rows", "cols"],
        attr_name=attr_name,
    )

    # Only build an explicit context when the caller supplied configuration.
    ctx = tiledb.Ctx(cfg.ctx_config) if cfg.ctx_config else None
    arr_writer = DenseCellArray(uri=uri, attr=attr_name, mode="w", config_or_context=ctx)

    print("shape of writer", arr_writer.shape)

    print(f"Writing data to dense array '{uri}'...")
    for i in range(0, rows, chunk_size):
        # The final batch may be shorter than ``chunk_size``.
        end_row = min(i + chunk_size, rows)
        num_chunk_rows = end_row - i
        data_chunk = np.random.rand(num_chunk_rows, cols).astype(attr_dtype)
        print(i, end_row, num_chunk_rows, data_chunk.shape)
        arr_writer.write_batch(data_chunk, start_row=i)
        # Report progress on every tenth batch.
        if (i // chunk_size) % 10 == 0:
            print(f"  Dense write: {end_row}/{rows} rows written.")

    print(f"Finished writing to dense array '{uri}'.")
82
+
83
+
84
def generate_tiledb_sparse_array(
    uri: str,
    rows: int,
    cols: int,
    density: float = 0.01,
    attr_name: str = "data",
    attr_dtype: np.dtype = np.float32,
    chunk_size: int = 1000,
    tiledb_config: Optional[Dict] = None,
    sparse_format_to_write: str = "coo",
):
    """Generate a sparse TileDB array and fill it with random data.

    Each batch is produced with ``scipy.sparse.random`` using ``attr_dtype``
    (float32 by default) and written starting at the batch's first row.
    Batches that contain no stored values are not written.

    Args:
        uri:
            URI for the new TileDB array. If an array already exists at
            this location it is removed with ``shutil.rmtree`` first.

        rows:
            Number of rows.

        cols:
            Number of columns (features).

        density:
            Density of the sparse matrix.

        attr_name:
            Name of the attribute.

        attr_dtype:
            Data type of the attribute.

        chunk_size:
            Number of rows to generate and write per batch. Must be positive.

        tiledb_config:
            TileDB context configuration.

        sparse_format_to_write:
            Scipy sparse format to use for generating chunks ('coo', 'csr', 'csc').

    Raises:
        ValueError:
            If ``chunk_size`` is not a positive integer.
    """
    # Validate before touching storage: a zero step would otherwise only fail
    # inside ``range`` after the old array was deleted, and a negative step
    # would silently produce an empty array.
    if chunk_size <= 0:
        raise ValueError(f"'chunk_size' must be a positive integer, got {chunk_size}.")

    if tiledb.array_exists(uri):
        print(f"Array {uri} already exists. Removing.")
        shutil.rmtree(uri)

    print(f"Creating sparse array at '{uri}' with shape ({rows}, {cols}), density ~{density}")
    cfg = CellArrConfig(ctx_config=tiledb_config if tiledb_config else {})

    # NOTE: ``cfg`` is intentionally not forwarded to ``create_cellarray``
    # here; it is only used to build the write context below.
    create_cellarray(
        uri=uri,
        shape=(rows, cols),
        attr_dtype=attr_dtype,
        sparse=True,
        dim_names=["rows", "cols"],
        attr_name=attr_name,
    )

    # Only build an explicit context when the caller supplied configuration.
    ctx = tiledb.Ctx(cfg.ctx_config) if cfg.ctx_config else None
    arr_writer = SparseCellArray(
        uri=uri,
        attr=attr_name,
        mode="w",
        config_or_context=ctx,
    )

    print(f"Writing data to sparse array '{uri}'...")
    for i in range(0, rows, chunk_size):
        # The final batch may be shorter than ``chunk_size``; with a positive
        # step every batch has at least one row.
        end_row = min(i + chunk_size, rows)
        num_chunk_rows = end_row - i

        data_chunk_scipy = sp.random(
            num_chunk_rows, cols, density=density, format=sparse_format_to_write, dtype=attr_dtype
        )

        # Skip batches with no stored values.
        if data_chunk_scipy.nnz > 0:
            arr_writer.write_batch(data_chunk_scipy, start_row=i)

        # Report progress on every tenth batch.
        if (i // chunk_size) % 10 == 0:
            print(f"  Sparse write: {end_row}/{rows} rows processed for writing.")

    print(f"Finished writing to sparse array '{uri}'.")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cellarr-array
3
- Version: 0.0.3
3
+ Version: 0.2.0
4
4
  Summary: Base class for handling TileDB backed arrays.
5
5
  Home-page: https://github.com/cellarr/cellarr-array
6
6
  Author: Jayaram Kancherla
@@ -16,10 +16,13 @@ Requires-Dist: importlib-metadata; python_version < "3.8"
16
16
  Requires-Dist: tiledb
17
17
  Requires-Dist: numpy
18
18
  Requires-Dist: scipy
19
+ Provides-Extra: optional
20
+ Requires-Dist: torch; extra == "optional"
19
21
  Provides-Extra: testing
20
22
  Requires-Dist: setuptools; extra == "testing"
21
23
  Requires-Dist: pytest; extra == "testing"
22
24
  Requires-Dist: pytest-cov; extra == "testing"
25
+ Requires-Dist: torch; extra == "testing"
23
26
  Dynamic: license-file
24
27
 
25
28
  [![PyPI-Server](https://img.shields.io/pypi/v/cellarr-array.svg)](https://pypi.org/project/cellarr-array/)
@@ -0,0 +1,19 @@
1
+ cellarr_array/__init__.py,sha256=coBnoCq1_cv6FnnbowNt6wEIDfVl2GlGTkjnveP-8C4,707
2
+ cellarr_array/core/__init__.py,sha256=fvM-FEiDn8TKDbHxhhzp9FXZFNovFwvIUSY6SpLQRdk,98
3
+ cellarr_array/core/base.py,sha256=3FlhzZSh4ePz3Zm_dU8XNXJ6xgs7rKGi5HgCVWJLhXY,13458
4
+ cellarr_array/core/dense.py,sha256=LODRH4utpKs8xhT79Q2-nRiam_s68_a0qPj0unEM7rg,3940
5
+ cellarr_array/core/helpers.py,sha256=Z_2zRUULFTm7Lo9EpkGvIeRraP6XNDRB-o3rh9ChKQQ,7856
6
+ cellarr_array/core/sparse.py,sha256=XifIWhbTRAQ6qL096th-dCkqscNRwFZuTd7uaRf9aGM,8844
7
+ cellarr_array/dataloaders/__init__.py,sha256=U-MfwC2K84OIXT75in41fe_wvoxjUC5Krb5zICQn_O8,245
8
+ cellarr_array/dataloaders/denseloader.py,sha256=JYJlbuX5My64iIPW_-nlPFkNIezxL3Z3mkwInS3hH9M,7291
9
+ cellarr_array/dataloaders/iterabledataloader.py,sha256=lR2T1YatyBlDM5Sy_75B7_8ORiWfn3cp4q48Oujwf-c,11916
10
+ cellarr_array/dataloaders/sparseloader.py,sha256=V_eKw-Z_CNxHP8c2BN3sOuuv6RPiWBzRfW1BYLhNaQc,7962
11
+ cellarr_array/dataloaders/utils.py,sha256=buJ87x1YBTt5-nZoy_I5j6ko1lVlHdiGpQCusdLoRLI,600
12
+ cellarr_array/utils/__init__.py,sha256=DM5jeUMbxbRzTu2QCjpLlrTQ5uionF887S_7i6_952U,177
13
+ cellarr_array/utils/config.py,sha256=67zBxpYY9N_v6TMdyljUIZmckbwOBcuLC99aJooGmfA,2917
14
+ cellarr_array/utils/mock.py,sha256=7GyCbtM7u94pm7qhjsPRSO2IWYLmd4UrjyvLnQtMMkc,4579
15
+ cellarr_array-0.2.0.dist-info/licenses/LICENSE.txt,sha256=JUlHIfWcRe_MZop18pQvMIPLKSSPz3XQ06ASHuW5Wh8,1076
16
+ cellarr_array-0.2.0.dist-info/METADATA,sha256=NbNM3Gyu4t2f1odp26QiUutfic0IdnagSCnJUn9NLSs,4228
17
+ cellarr_array-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
18
+ cellarr_array-0.2.0.dist-info/top_level.txt,sha256=oErp0D8ABZV-QPtTiXT8_F2z36Ic7ykuDg_1Y84HLZM,14
19
+ cellarr_array-0.2.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (78.1.0)
2
+ Generator: setuptools (80.9.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,6 +1,6 @@
1
1
  The MIT License (MIT)
2
2
 
3
- Copyright (c) 2025 Jayaram Kancherla
3
+ Copyright (c) 2025 Genentech
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
@@ -1,251 +0,0 @@
1
- from abc import ABC, abstractmethod
2
- from contextlib import contextmanager
3
-
4
- try:
5
- from types import EllipsisType
6
- except ImportError:
7
- # TODO: This is required for Python <3.10. Remove once Python 3.9 reaches EOL in October 2025
8
- EllipsisType = type(...)
9
- from typing import List, Literal, Optional, Tuple, Union
10
-
11
- import numpy as np
12
- import tiledb
13
- from scipy import sparse
14
-
15
- from .config import ConsolidationConfig
16
- from .helpers import SliceHelper
17
-
18
- __author__ = "Jayaram Kancherla"
19
- __copyright__ = "Jayaram Kancherla"
20
- __license__ = "MIT"
21
-
22
-
23
class CellArray(ABC):
    """Abstract base class for TileDB array operations.

    Subclasses implement ``_direct_slice``, ``_multi_index`` and
    ``write_batch``. Schema-derived properties (dimension names, attribute
    names, shape, number of dimensions) and the non-empty domain are read
    lazily and cached on first access.
    """

    def __init__(
        self,
        uri: str,
        attr: str = "data",
        mode: Optional[Literal["r", "w", "m", "d"]] = None,
        config_or_context: Optional[Union[tiledb.Config, tiledb.Ctx]] = None,
        validate: bool = True,
    ):
        """Initialize the object.

        Args:
            uri:
                URI to the array.

            attr:
                Attribute to access.
                Defaults to "data".

            mode:
                Open the array object in read 'r', write 'w', modify
                exclusive 'm' mode, or delete 'd' mode.

                Defaults to None for automatic mode switching.

            config_or_context:
                Optional config or context object.

                Defaults to None.

            validate:
                Whether to validate the attributes.
                Defaults to True.

        Raises:
            ValueError:
                If ``mode`` is not one of the supported values, or if
                validation is enabled and the array has more than two
                dimensions or lacks ``attr``.

            TypeError:
                If ``config_or_context`` is neither a TileDB config nor a
                context object.
        """
        self.uri = uri
        # Go through the property setter so an unsupported mode is rejected
        # here rather than later when the array is opened.
        self.mode = mode

        if config_or_context is None:
            ctx = None
        elif isinstance(config_or_context, tiledb.Config):
            ctx = tiledb.Ctx(config_or_context)
        elif isinstance(config_or_context, tiledb.Ctx):
            ctx = config_or_context
        else:
            raise TypeError("'config_or_context' must be either TileDB config or a context object.")

        self._ctx = ctx
        self._array = None
        # Lazily populated caches for the properties below.
        self._shape = None
        self._ndim = None
        self._dim_names = None
        self._attr_names = None
        self._nonempty_domain = None

        if validate:
            self._validate(attr=attr)

        self._attr = attr

    def _validate(self, attr):
        """Check that the array has at most two dimensions and contains ``attr``."""
        with self.open_array(mode="r") as A:
            if A.ndim > 2:
                raise ValueError("Only 1D and 2D arrays are supported.")

        if attr not in self.attr_names:
            raise ValueError(
                f"Attribute '{attr}' does not exist in the array. Available attributes: {self.attr_names}."
            )

    @property
    def mode(self) -> Optional[str]:
        """Get current array mode."""
        return self._mode

    @mode.setter
    def mode(self, value: Optional[str]):
        """Set array mode.

        Args:
            value:
                One of `None`, 'r', 'w', or 'm', 'd'.

        Raises:
            ValueError:
                If ``value`` is not one of the supported modes.
        """
        if value is not None and value not in ["r", "w", "m", "d"]:
            raise ValueError("Mode must be one of: None, 'r', 'w', 'm', 'd'")
        self._mode = value

    @property
    def dim_names(self) -> List[str]:
        """Get dimension names of the array."""
        if self._dim_names is None:
            with self.open_array(mode="r") as A:
                self._dim_names = [dim.name for dim in A.schema.domain]
        return self._dim_names

    @property
    def attr_names(self) -> List[str]:
        """Get attribute names of the array."""
        if self._attr_names is None:
            with self.open_array(mode="r") as A:
                self._attr_names = [A.schema.attr(i).name for i in range(A.schema.nattr)]
        return self._attr_names

    @property
    def shape(self) -> Tuple[int, ...]:
        """Get array shape from schema domain."""
        if self._shape is None:
            with self.open_array(mode="r") as A:
                # Domain bounds are inclusive, hence the "+ 1".
                self._shape = tuple(int(dim.domain[1] - dim.domain[0] + 1) for dim in A.schema.domain)
        return self._shape

    @property
    def nonempty_domain(self) -> Tuple[int, ...]:
        """Get array non-empty domain.

        NOTE: the value is cached on first access and nothing in this class
        refreshes it afterwards.
        """
        if self._nonempty_domain is None:
            with self.open_array(mode="r") as A:
                self._nonempty_domain = A.nonempty_domain()
        return self._nonempty_domain

    @property
    def ndim(self) -> int:
        """Get number of dimensions."""
        if self._ndim is None:
            self._ndim = len(self.shape)
        return self._ndim

    @contextmanager
    def open_array(self, mode: Optional[str] = None):
        """Context manager for array operations.

        The array is opened with this object's context and closed when the
        ``with`` block exits, including on error.

        Args:
            mode:
                Override mode for this operation. Falls back to the
                object's mode, and to read mode if that is also unset.
        """
        mode = mode if mode is not None else self.mode
        mode = mode if mode is not None else "r"  # Default to read mode

        array = tiledb.open(self.uri, mode=mode, ctx=self._ctx)
        try:
            yield array
        finally:
            array.close()

    def __getitem__(self, key: Union[slice, EllipsisType, Tuple[Union[slice, List[int]], ...]]):
        """Get item implementation that routes to either direct slicing or multi_index
        based on the type of indices provided.

        Args:
            key:
                Slice or list of indices for each dimension in the array.

        Raises:
            IndexError:
                If more indices than dimensions are given, if more than one
                Ellipsis is present, or if an Ellipsis is combined with
                non-slice indices.
        """
        if not isinstance(key, tuple):
            key = (key,)

        if len(key) > self.ndim:
            raise IndexError(f"Invalid number of dimensions: got {len(key)}, expected {self.ndim}")

        # Normalize all indices
        normalized_key = tuple(SliceHelper.normalize_index(idx, self.shape[i]) for i, idx in enumerate(key))

        num_ellipsis = sum(isinstance(i, EllipsisType) for i in normalized_key)
        if num_ellipsis > 1:
            raise IndexError(f"Found more than 1 Ellipsis (...) in key: {normalized_key}")

        # Direct slicing is possible only when every index is a slice or Ellipsis.
        use_direct = all(isinstance(idx, (slice, EllipsisType)) for idx in normalized_key)
        if use_direct:
            return self._direct_slice(normalized_key)

        if num_ellipsis > 0:
            raise IndexError(f"tiledb does not support ellipsis in multi-index access: {normalized_key}")
        return self._multi_index(normalized_key)

    @abstractmethod
    def _direct_slice(self, key: Tuple[Union[slice, EllipsisType], ...]) -> np.ndarray:
        """Implementation for direct slicing."""
        pass

    @abstractmethod
    def _multi_index(self, key: Tuple[Union[slice, List[int]], ...]) -> np.ndarray:
        """Implementation for multi-index access."""
        pass

    def vacuum(self) -> None:
        """Remove deleted fragments from the array."""
        # Use this object's context so custom settings are not dropped.
        tiledb.vacuum(self.uri, ctx=self._ctx)

    def consolidate(self, config: Optional[ConsolidationConfig] = None) -> None:
        """Consolidate array fragments.

        Args:
            config:
                Optional consolidation configuration. A default
                ``ConsolidationConfig`` is used when omitted.
        """
        if config is None:
            config = ConsolidationConfig()

        consolidation_cfg = tiledb.Config()

        consolidation_cfg["sm.consolidation.steps"] = config.steps
        consolidation_cfg["sm.consolidation.step_min_frags"] = config.step_min_frags
        consolidation_cfg["sm.consolidation.step_max_frags"] = config.step_max_frags
        consolidation_cfg["sm.consolidation.buffer_size"] = config.buffer_size
        consolidation_cfg["sm.mem.total_budget"] = config.total_budget

        # Use this object's context so custom settings are not dropped.
        tiledb.consolidate(self.uri, config=consolidation_cfg, ctx=self._ctx)

        if config.vacuum_after:
            self.vacuum()

    @abstractmethod
    def write_batch(self, data: Union[np.ndarray, sparse.spmatrix], start_row: int, **kwargs) -> None:
        """Write a batch of data to the array starting at the specified row.

        Args:
            data:
                Data to write (numpy array for dense, scipy sparse matrix for sparse).

            start_row:
                Starting row index for writing.

            **kwargs:
                Additional arguments for write operation.
        """
        pass
@@ -1,11 +0,0 @@
1
- cellarr_array/CellArray.py,sha256=vc_6oDLCpVgUaP8HsQz4vE0ZyJ1SPdX43s7VQyh7gF0,8204
2
- cellarr_array/DenseCellArray.py,sha256=rlu2xq8SONwIswqe0TzRNCwM5f0HYgxr4QBtvbBe8ro,3953
3
- cellarr_array/SparseCellArray.py,sha256=cOIbs_97j5u13FU7FfEfRNqAZi8rHUkypgLgRcubXrU,7304
4
- cellarr_array/__init__.py,sha256=IUE9wMDISgRkWp-Fc0KJpDiezCJ61kzuTqS9HdK-JeE,779
5
- cellarr_array/config.py,sha256=67zBxpYY9N_v6TMdyljUIZmckbwOBcuLC99aJooGmfA,2917
6
- cellarr_array/helpers.py,sha256=ZqK_josEzKzTMP62P9pb4qBiOTisFofTCnu1LETYJT4,6449
7
- cellarr_array-0.0.3.dist-info/licenses/LICENSE.txt,sha256=qI2hRZobcUlj8gqFqXwqt522HeYyWvHLF00zCSZofHA,1084
8
- cellarr_array-0.0.3.dist-info/METADATA,sha256=1KgSZEEF2i9aCr4mkkyAmuPXht4y5ZG2-YdH4dBELpQ,4120
9
- cellarr_array-0.0.3.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
10
- cellarr_array-0.0.3.dist-info/top_level.txt,sha256=oErp0D8ABZV-QPtTiXT8_F2z36Ic7ykuDg_1Y84HLZM,14
11
- cellarr_array-0.0.3.dist-info/RECORD,,
File without changes