cellarr-array 0.1.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cellarr-array might be problematic. Click here for more details.

Files changed (57) hide show
  1. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/.github/workflows/publish-pypi.yml +2 -2
  2. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/.github/workflows/run-tests.yml +1 -1
  3. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/.gitignore +2 -0
  4. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/.pre-commit-config.yaml +1 -1
  5. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/CHANGELOG.md +13 -0
  6. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/PKG-INFO +6 -1
  7. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/setup.cfg +5 -0
  8. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/src/cellarr_array/__init__.py +3 -4
  9. cellarr_array-0.3.0/src/cellarr_array/core/__init__.py +3 -0
  10. cellarr_array-0.1.0/src/cellarr_array/cellarray_base.py → cellarr_array-0.3.0/src/cellarr_array/core/base.py +50 -3
  11. cellarr_array-0.1.0/src/cellarr_array/cellarray_dense.py → cellarr_array-0.3.0/src/cellarr_array/core/dense.py +2 -3
  12. {cellarr_array-0.1.0/src/cellarr_array → cellarr_array-0.3.0/src/cellarr_array/core}/helpers.py +101 -51
  13. cellarr_array-0.1.0/src/cellarr_array/cellarray_sparse.py → cellarr_array-0.3.0/src/cellarr_array/core/sparse.py +11 -17
  14. cellarr_array-0.3.0/src/cellarr_array/dataloaders/__init__.py +3 -0
  15. cellarr_array-0.3.0/src/cellarr_array/dataloaders/denseloader.py +198 -0
  16. cellarr_array-0.3.0/src/cellarr_array/dataloaders/iterabledataloader.py +320 -0
  17. cellarr_array-0.3.0/src/cellarr_array/dataloaders/sparseloader.py +230 -0
  18. cellarr_array-0.3.0/src/cellarr_array/dataloaders/utils.py +26 -0
  19. cellarr_array-0.3.0/src/cellarr_array/utils/__init__.py +3 -0
  20. cellarr_array-0.3.0/src/cellarr_array/utils/mock.py +167 -0
  21. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/src/cellarr_array.egg-info/PKG-INFO +6 -1
  22. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/src/cellarr_array.egg-info/SOURCES.txt +18 -6
  23. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/src/cellarr_array.egg-info/requires.txt +5 -0
  24. cellarr_array-0.3.0/tests/conftest.py +233 -0
  25. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/tests/test_all.py +1 -1
  26. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/tests/test_dense.py +5 -5
  27. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/tests/test_helpers.py +25 -13
  28. cellarr_array-0.3.0/tests/test_iterable_loader.py +288 -0
  29. cellarr_array-0.3.0/tests/test_map_loader.py +289 -0
  30. cellarr_array-0.3.0/tests/test_query.py +63 -0
  31. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/tests/test_sparse.py +1 -1
  32. cellarr_array-0.3.0/tests/test_string_dims.py +73 -0
  33. cellarr_array-0.1.0/tests/conftest.py +0 -91
  34. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/.coveragerc +0 -0
  35. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/.readthedocs.yml +0 -0
  36. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/AUTHORS.md +0 -0
  37. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/CONTRIBUTING.md +0 -0
  38. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/LICENSE.txt +0 -0
  39. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/README.md +0 -0
  40. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/docs/Makefile +0 -0
  41. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/docs/_static/.gitignore +0 -0
  42. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/docs/authors.md +0 -0
  43. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/docs/changelog.md +0 -0
  44. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/docs/conf.py +0 -0
  45. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/docs/contributing.md +0 -0
  46. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/docs/index.md +0 -0
  47. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/docs/license.md +0 -0
  48. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/docs/readme.md +0 -0
  49. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/docs/requirements.txt +0 -0
  50. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/pyproject.toml +0 -0
  51. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/setup.py +0 -0
  52. {cellarr_array-0.1.0/src/cellarr_array → cellarr_array-0.3.0/src/cellarr_array/utils}/config.py +0 -0
  53. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/src/cellarr_array.egg-info/dependency_links.txt +0 -0
  54. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/src/cellarr_array.egg-info/not-zip-safe +0 -0
  55. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/src/cellarr_array.egg-info/top_level.txt +0 -0
  56. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/tests/test_inmemory.py +0 -0
  57. {cellarr_array-0.1.0 → cellarr_array-0.3.0}/tox.ini +0 -0
@@ -19,10 +19,10 @@ jobs:
19
19
  steps:
20
20
  - uses: actions/checkout@v4
21
21
 
22
- - name: Set up Python 3.11
22
+ - name: Set up Python 3.12
23
23
  uses: actions/setup-python@v5
24
24
  with:
25
- python-version: 3.11
25
+ python-version: 3.12
26
26
 
27
27
  - name: Install dependencies
28
28
  run: |
@@ -28,7 +28,7 @@ jobs:
28
28
  test:
29
29
  strategy:
30
30
  matrix:
31
- python: ["3.9", "3.10", "3.11", "3.12", "3.13"]
31
+ python: ["3.10", "3.11", "3.12", "3.13", "3.14"]
32
32
  platform:
33
33
  - ubuntu-latest
34
34
  # - macos-latest
@@ -52,3 +52,5 @@ MANIFEST
52
52
  .venv*/
53
53
  .conda*/
54
54
  .python-version
55
+
56
+ *.tdb
@@ -19,7 +19,7 @@ repos:
19
19
 
20
20
  - repo: https://github.com/astral-sh/ruff-pre-commit
21
21
  # Ruff version.
22
- rev: v0.11.5
22
+ rev: v0.12.1
23
23
  hooks:
24
24
  - id: ruff
25
25
  args: [--fix, --exit-non-zero-on-fix]
@@ -1,5 +1,18 @@
1
1
  # Changelog
2
2
 
3
+ ## Version 0.3.0
4
+
5
+ - Support for string dimensions when creating cellarr arrays.
6
+ - Support query conditions for slice operations.
7
+ - Added unique dim values. Only supported for sparse arrays.
8
+ - EOL for Python 3.9
9
+
10
+ ## Version 0.2.0
11
+
12
+ - Dataloaders for sparse and dense arrays. We provide templates for both map-style and iterable-style dataloaders. Users are expected to understand the caveats of both of these approaches.
13
+ - Fixed a bug with slicing on 1D arrays and many improvements for optimizing slicing parameters.
14
+ - Update documentation and tests.
15
+
3
16
  ## Version 0.1.0
4
17
 
5
18
  - Support cellarr-arrays on user provided tiledb array objects.
@@ -1,12 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cellarr-array
3
- Version: 0.1.0
3
+ Version: 0.3.0
4
4
  Summary: Base class for handling TileDB backed arrays.
5
5
  Home-page: https://github.com/cellarr/cellarr-array
6
6
  Author: Jayaram Kancherla
7
7
  Author-email: jayaram.kancherla@gmail.com
8
8
  License: MIT
9
9
  Project-URL: Documentation, https://github.com/cellarr/cellarr-array
10
+ Project-URL: Source, https://github.com/cellarr/cellarr-array
10
11
  Platform: any
11
12
  Classifier: Development Status :: 4 - Beta
12
13
  Classifier: Programming Language :: Python
@@ -16,10 +17,14 @@ Requires-Dist: importlib-metadata; python_version < "3.8"
16
17
  Requires-Dist: tiledb
17
18
  Requires-Dist: numpy
18
19
  Requires-Dist: scipy
20
+ Provides-Extra: optional
21
+ Requires-Dist: torch; extra == "optional"
19
22
  Provides-Extra: testing
20
23
  Requires-Dist: setuptools; extra == "testing"
21
24
  Requires-Dist: pytest; extra == "testing"
22
25
  Requires-Dist: pytest-cov; extra == "testing"
26
+ Requires-Dist: pandas; extra == "testing"
27
+ Requires-Dist: torch; extra == "testing"
23
28
  Dynamic: license-file
24
29
 
25
30
  [![PyPI-Server](https://img.shields.io/pypi/v/cellarr-array.svg)](https://pypi.org/project/cellarr-array/)
@@ -10,6 +10,7 @@ long_description_content_type = text/markdown; charset=UTF-8; variant=GFM
10
10
  url = https://github.com/cellarr/cellarr-array
11
11
  project_urls =
12
12
  Documentation = https://github.com/cellarr/cellarr-array
13
+ Source = https://github.com/cellarr/cellarr-array
13
14
  platforms = any
14
15
  classifiers =
15
16
  Development Status :: 4 - Beta
@@ -33,10 +34,14 @@ exclude =
33
34
  tests
34
35
 
35
36
  [options.extras_require]
37
+ optional =
38
+ torch
36
39
  testing =
37
40
  setuptools
38
41
  pytest
39
42
  pytest-cov
43
+ pandas
44
+ %(optional)s
40
45
 
41
46
  [options.entry_points]
42
47
 
@@ -15,7 +15,6 @@ except PackageNotFoundError: # pragma: no cover
15
15
  finally:
16
16
  del version, PackageNotFoundError
17
17
 
18
- from .config import CellArrConfig, ConsolidationConfig
19
- from .cellarray_dense import DenseCellArray
20
- from .cellarray_sparse import SparseCellArray
21
- from .helpers import create_cellarray, SliceHelper
18
+ from .core import DenseCellArray, SparseCellArray
19
+ from .core.helpers import create_cellarray
20
+ from .utils import CellArrConfig, ConsolidationConfig
@@ -0,0 +1,3 @@
1
+ from .base import CellArray
2
+ from .dense import DenseCellArray
3
+ from .sparse import SparseCellArray
@@ -12,7 +12,7 @@ import numpy as np
12
12
  import tiledb
13
13
  from scipy import sparse
14
14
 
15
- from .config import ConsolidationConfig
15
+ from ..utils.config import ConsolidationConfig
16
16
  from .helpers import SliceHelper
17
17
 
18
18
  __author__ = "Jayaram Kancherla"
@@ -69,6 +69,7 @@ class CellArray(ABC):
69
69
  self._array_passed_in = False
70
70
  self._opened_array_external = None
71
71
  self._ctx = None
72
+ self._dim_dtypes = None
72
73
 
73
74
  if tiledb_array_obj is not None:
74
75
  if not isinstance(tiledb_array_obj, tiledb.Array):
@@ -185,7 +186,16 @@ class CellArray(ABC):
185
186
  def shape(self) -> Tuple[int, ...]:
186
187
  if self._shape is None:
187
188
  with self.open_array(mode="r") as A:
188
- self._shape = tuple(int(dim.domain[1] - dim.domain[0] + 1) for dim in A.schema.domain)
189
+ shape_list = []
190
+ for dim in A.schema.domain:
191
+ try:
192
+ # This will fail for string dimensions
193
+ shape_list.append(dim.shape[0])
194
+ except TypeError:
195
+ # For string dimensions, the shape is not well-defined.
196
+ # We use a large number as a placeholder for slicing purposes.
197
+ shape_list.append(2**63 - 1)
198
+ self._shape = tuple(shape_list)
189
199
  return self._shape
190
200
 
191
201
  @property
@@ -209,6 +219,14 @@ class CellArray(ABC):
209
219
  # self._ndim = len(self.shape)
210
220
  return self._ndim
211
221
 
222
+ @property
223
+ def dim_dtypes(self) -> List[np.dtype]:
224
+ """Get dimension dtypes of the array."""
225
+ if self._dim_dtypes is None:
226
+ with self.open_array(mode="r") as A:
227
+ self._dim_dtypes = [dim.dtype for dim in A.schema.domain]
228
+ return self._dim_dtypes
229
+
212
230
  @contextmanager
213
231
  def open_array(self, mode: Optional[str] = None):
214
232
  """Context manager for array operations.
@@ -266,15 +284,30 @@ class CellArray(ABC):
266
284
  Args:
267
285
  key:
268
286
  Slice or list of indices for each dimension in the array.
287
+
288
+ Alternatively, may be string to specify query conditions.
269
289
  """
290
+ # This is a query condition
291
+ if isinstance(key, str):
292
+ with self.open_array(mode="r") as array:
293
+ if self._attr is not None:
294
+ return array.query(cond=key, attrs=[self._attr])[:]
295
+ else:
296
+ array.query(cond=key)[:]
297
+
270
298
  if not isinstance(key, tuple):
271
299
  key = (key,)
272
300
 
273
301
  if len(key) > self.ndim:
274
302
  raise IndexError(f"Invalid number of dimensions: got {len(key)}, expected {self.ndim}")
275
303
 
304
+ if len(key) < self.ndim:
305
+ key = key + (slice(None),) * (self.ndim - len(key))
306
+
276
307
  # Normalize all indices
277
- normalized_key = tuple(SliceHelper.normalize_index(idx, self.shape[i]) for i, idx in enumerate(key))
308
+ normalized_key = tuple(
309
+ SliceHelper.normalize_index(idx, self.shape[i], self.dim_dtypes[i]) for i, idx in enumerate(key)
310
+ )
278
311
 
279
312
  num_ellipsis = sum(isinstance(i, EllipsisType) for i in normalized_key)
280
313
  if num_ellipsis > 1:
@@ -342,3 +375,17 @@ class CellArray(ABC):
342
375
  Additional arguments for write operation.
343
376
  """
344
377
  pass
378
+
379
+ def get_unique_dim_values(self, dim_name: Optional[str] = None) -> np.ndarray:
380
+ """Get unique values for a dimension.
381
+
382
+ Args:
383
+ dim_name:
384
+ The name of the dimension. If None, unique values for all
385
+ dimensions are returned.
386
+
387
+ Returns:
388
+ An array of unique dimension values.
389
+ """
390
+ with self.open_array(mode="r") as A:
391
+ return A.unique_dim_values(dim_name)
@@ -7,7 +7,7 @@ from typing import List, Tuple, Union
7
7
 
8
8
  import numpy as np
9
9
 
10
- from .cellarray_base import CellArray
10
+ from .base import CellArray
11
11
  from .helpers import SliceHelper
12
12
 
13
13
  __author__ = "Jayaram Kancherla"
@@ -92,7 +92,6 @@ class DenseCellArray(CellArray):
92
92
  if len(data.shape) != self.ndim:
93
93
  raise ValueError(f"Data dimensions {data.shape} don't match array dimensions {self.shape}.")
94
94
 
95
- # Check bounds
96
95
  end_row = start_row + data.shape[0]
97
96
  if end_row > self.shape[0]:
98
97
  raise ValueError(
@@ -102,7 +101,6 @@ class DenseCellArray(CellArray):
102
101
  if self.ndim == 2 and data.shape[1] != self.shape[1]:
103
102
  raise ValueError(f"Data columns {data.shape[1]} don't match array columns {self.shape[1]}.")
104
103
 
105
- # Construct write region
106
104
  if self.ndim == 1:
107
105
  write_region = slice(start_row, end_row)
108
106
  else: # 2D
@@ -110,4 +108,5 @@ class DenseCellArray(CellArray):
110
108
 
111
109
  # write_data = {self._attr: data} if len(self.attr_names) > 1 else data
112
110
  with self.open_array(mode="w") as array:
111
+ print("write_region", write_region)
113
112
  array[write_region] = data
@@ -8,7 +8,7 @@ from typing import List, Optional, Tuple, Union
8
8
  import numpy as np
9
9
  import tiledb
10
10
 
11
- from .config import CellArrConfig
11
+ from ..utils.config import CellArrConfig
12
12
 
13
13
  __author__ = "Jayaram Kancherla"
14
14
  __copyright__ = "Jayaram Kancherla"
@@ -52,7 +52,7 @@ def create_cellarray(
52
52
  Optional list of dimension names.
53
53
 
54
54
  dim_dtypes:
55
- Optional list of dimension dtypes.
55
+ Optional list of dimension dtypes. Defaults to numpy's uint32.
56
56
 
57
57
  attr_name:
58
58
  Name of the data attribute.
@@ -67,29 +67,28 @@ def create_cellarray(
67
67
  ValueError: If dimensions are invalid or inputs are inconsistent.
68
68
  """
69
69
  config = config or CellArrConfig()
70
+ tiledb_ctx = tiledb.Config(config.ctx_config) if config.ctx_config else None
70
71
 
71
72
  if attr_dtype is None:
72
73
  attr_dtype = np.float32
73
74
  if isinstance(attr_dtype, str):
74
75
  attr_dtype = np.dtype(attr_dtype)
75
76
 
76
- # Require either shape or dim_dtypes
77
77
  if shape is None and dim_dtypes is None:
78
78
  raise ValueError("Either 'shape' or 'dim_dtypes' must be provided.")
79
79
 
80
80
  if shape is not None:
81
81
  if len(shape) not in (1, 2):
82
- raise ValueError("Only 1D and 2D arrays are supported.")
82
+ raise ValueError("Shape must have 1 or 2 dimensions.")
83
83
 
84
84
  # Set dimension dtypes, defaults to numpy uint32
85
85
  if dim_dtypes is None:
86
86
  dim_dtypes = [np.uint32] * len(shape)
87
87
  else:
88
88
  if len(dim_dtypes) not in (1, 2):
89
- raise ValueError("Only 1D and 2D arrays are supported.")
89
+ raise ValueError("Array must have 1 or 2 dimensions.")
90
90
  dim_dtypes = [np.dtype(dt) if isinstance(dt, str) else dt for dt in dim_dtypes]
91
91
 
92
- # Calculate shape from dtypes if needed
93
92
  if shape is None:
94
93
  shape = tuple(np.iinfo(dt).max if np.issubdtype(dt, np.integer) else None for dt in dim_dtypes)
95
94
  if None in shape:
@@ -97,7 +96,6 @@ def create_cellarray(
97
96
  np.iinfo(dt).max if s is None and np.issubdtype(dt, np.integer) else s for s, dt in zip(shape, dim_dtypes)
98
97
  )
99
98
 
100
- # Set dimension names
101
99
  if dim_names is None:
102
100
  dim_names = [f"dim_{i}" for i in range(len(shape))]
103
101
 
@@ -105,42 +103,53 @@ def create_cellarray(
105
103
  if not (len(shape) == len(dim_dtypes) == len(dim_names)):
106
104
  raise ValueError("Lengths of 'shape', 'dim_dtypes', and 'dim_names' must match.")
107
105
 
108
- dom = tiledb.Domain(
109
- *[
110
- tiledb.Dim(name=name, domain=(0, s - 1), tile=min(s, config.tile_capacity), dtype=dt)
111
- for name, s, dt in zip(dim_names, shape, dim_dtypes)
112
- ],
113
- ctx=tiledb.Ctx(config.ctx_config),
114
- )
106
+ dims = []
107
+ for name, s, dt in zip(dim_names, shape, dim_dtypes):
108
+ if np.issubdtype(dt, np.integer):
109
+ domain = (0, 0 if s == 0 else s - 1)
110
+ tile = min(1 if s == 0 else s // 2, config.tile_capacity // 2)
111
+ dim_dtype = dt
112
+ else: # Assumes string or object dtype
113
+ domain = (None, None)
114
+ tile = None
115
+ dim_dtype = "ascii"
116
+
117
+ dims.append(
118
+ tiledb.Dim(
119
+ name=name,
120
+ domain=domain,
121
+ tile=tile,
122
+ dtype=dim_dtype,
123
+ )
124
+ )
115
125
 
116
- attr = tiledb.Attr(
126
+ dom = tiledb.Domain(*dims, ctx=tiledb_ctx)
127
+ attr_obj = tiledb.Attr(
117
128
  name=attr_name,
118
129
  dtype=attr_dtype,
119
130
  filters=config.attrs_filters.get(attr_name, config.attrs_filters.get("", None)),
131
+ ctx=tiledb_ctx,
120
132
  )
121
-
122
133
  schema = tiledb.ArraySchema(
123
134
  domain=dom,
124
- attrs=[attr],
135
+ attrs=[attr_obj],
125
136
  cell_order=config.cell_order,
126
137
  tile_order=config.tile_order,
127
138
  sparse=sparse,
128
139
  coords_filters=config.coords_filters,
129
140
  offsets_filters=config.offsets_filters,
130
- ctx=tiledb.Ctx(config.ctx_config),
141
+ ctx=tiledb_ctx,
131
142
  )
132
-
133
- tiledb.Array.create(uri, schema)
143
+ tiledb.Array.create(uri, schema, ctx=tiledb_ctx)
134
144
 
135
145
  # Import here to avoid circular imports
136
- from .cellarray_dense import DenseCellArray
137
- from .cellarray_sparse import SparseCellArray
146
+ from .dense import DenseCellArray
147
+ from .sparse import SparseCellArray
138
148
 
139
- # Return appropriate array type
140
149
  return (
141
- SparseCellArray(uri=uri, attr=attr_name, mode=mode)
150
+ SparseCellArray(uri=uri, attr=attr_name, mode=mode, config_or_context=tiledb_ctx)
142
151
  if sparse
143
- else DenseCellArray(uri=uri, attr=attr_name, mode=mode)
152
+ else DenseCellArray(uri=uri, attr=attr_name, mode=mode, config_or_context=tiledb_ctx)
144
153
  )
145
154
 
146
155
 
@@ -148,59 +157,100 @@ class SliceHelper:
148
157
  """Helper class for handling array slicing operations."""
149
158
 
150
159
  @staticmethod
151
- def is_contiguous_indices(indices: List[int]) -> Optional[slice]:
152
- """Check if indices can be represented as a contiguous slice."""
160
+ def is_contiguous_indices(indices: List) -> Optional[slice]:
161
+ """Checks if a list of indices is contiguous and can be converted to a slice.
162
+
163
+ Returns None if the list is not contiguous or contains non-integers.
164
+ """
153
165
  if not indices:
154
166
  return None
155
167
 
156
- diffs = np.diff(indices)
168
+ if not all(isinstance(i, (int, np.integer)) for i in indices):
169
+ return None
170
+
171
+ sorted_indices = sorted(list(set(indices)))
172
+ if not sorted_indices:
173
+ return None
174
+
175
+ if len(sorted_indices) == 1:
176
+ return slice(sorted_indices[0], sorted_indices[0] + 1, None)
177
+
178
+ diffs = np.diff(sorted_indices)
157
179
  if np.all(diffs == 1):
158
- return slice(indices[0], indices[-1] + 1, None)
180
+ return slice(sorted_indices[0], sorted_indices[-1] + 1, None)
181
+
159
182
  return None
160
183
 
161
184
  @staticmethod
162
- def normalize_index(idx: Union[int, slice, List[int]], dim_size: int) -> Union[slice, List[int], EllipsisType]:
185
+ def normalize_index(
186
+ idx: Union[int, range, slice, List, str, EllipsisType],
187
+ dim_size: int,
188
+ dim_dtype: np.dtype,
189
+ ):
163
190
  """Normalize index to handle negative indices and ensure consistency."""
191
+ is_string_dim = np.issubdtype(dim_dtype, np.str_) or np.issubdtype(dim_dtype, np.bytes_)
192
+
193
+ if is_string_dim:
194
+ if isinstance(idx, (str, bytes)):
195
+ return [idx]
196
+ if isinstance(idx, list) and all(isinstance(i, (str, bytes)) for i in idx):
197
+ return idx
198
+ if isinstance(idx, slice):
199
+ # For string dimensions, we do not normalize the slice with integer sizes
200
+ return idx
201
+ if isinstance(idx, EllipsisType):
202
+ return idx
203
+ raise TypeError(f"Unsupported index type '{type(idx).__name__}' for string dimension.")
164
204
 
165
205
  if isinstance(idx, EllipsisType):
166
206
  return idx
167
207
 
168
- # Convert ranges to slices
169
208
  if isinstance(idx, range):
170
209
  idx = slice(idx.start, idx.stop, idx.step)
171
210
 
172
211
  if isinstance(idx, slice):
173
- start = idx.start if idx.start is not None else 0
174
- stop = idx.stop if idx.stop is not None else dim_size
175
- step = idx.step
212
+ start, stop, step = idx.start, idx.stop, idx.step
213
+
214
+ # Resolve None to full dimension slice parts
215
+ if start is None:
216
+ start = 0
217
+
218
+ if stop is None:
219
+ stop = dim_size
176
220
 
177
221
  # Handle negative indices
178
222
  if start < 0:
179
- start = dim_size + start
180
-
223
+ start += dim_size
181
224
  if stop < 0:
182
- stop = dim_size + stop
183
-
184
- if start < 0 or start > dim_size:
185
- raise IndexError(f"Start index {start} out of bounds for dimension size {dim_size}")
186
- if stop < 0 or stop > dim_size:
187
- raise IndexError(f"Stop index {stop} out of bounds for dimension size {dim_size}")
225
+ stop += dim_size
188
226
 
227
+ # Clamping slice arguments to dimensions
228
+ stop = min(stop, dim_size)
229
+ start = max(0, start)
189
230
  return slice(start, stop, step)
190
231
 
191
- elif isinstance(idx, list):
232
+ if isinstance(idx, list):
233
+ if not idx:
234
+ return []
235
+ # This check only applies to integer lists
236
+ if not all(isinstance(i, (int, np.integer)) for i in idx):
237
+ raise TypeError("List indices must be integers for numeric dimensions.")
238
+
192
239
  norm_idx = [i if i >= 0 else dim_size + i for i in idx]
193
240
  if any(i < 0 or i >= dim_size for i in norm_idx):
194
- raise IndexError(f"List indices {idx} out of bounds for dimension size {dim_size}")
195
- return norm_idx
196
-
197
- else: # Single integer index
198
- norm_idx = idx if idx >= 0 else dim_size + idx
199
-
200
- if norm_idx < 0 or norm_idx >= dim_size:
201
- raise IndexError(f"Index {idx} out of bounds for dimension size {dim_size}")
241
+ raise IndexError("List indices out of bounds for dimension size.")
242
+ return sorted(list(set(norm_idx)))
243
+
244
+ if isinstance(idx, (int, np.integer)):
245
+ norm_idx = int(idx)
246
+ if norm_idx < 0:
247
+ norm_idx += dim_size
248
+ if not (0 <= norm_idx < dim_size):
249
+ raise IndexError(f"Index {idx} out of bounds for dimension size.")
202
250
  return slice(norm_idx, norm_idx + 1, None)
203
251
 
252
+ raise TypeError(f"Index type {type(idx)} not supported for normalization.")
253
+
204
254
 
205
255
  def create_group(output_path, group_name):
206
256
  tiledb.group_create(f"{output_path}/{group_name}")
@@ -9,8 +9,8 @@ import numpy as np
9
9
  import tiledb
10
10
  from scipy import sparse
11
11
 
12
- from .cellarray_base import CellArray
13
12
  from .helpers import SliceHelper
13
+ from .base import CellArray
14
14
 
15
15
  __author__ = "Jayaram Kancherla"
16
16
  __copyright__ = "Jayaram Kancherla"
@@ -28,7 +28,7 @@ class SparseCellArray(CellArray):
28
28
  mode: Optional[Literal["r", "w", "d", "m"]] = None,
29
29
  config_or_context: Optional[Union[tiledb.Config, tiledb.Ctx]] = None,
30
30
  return_sparse: bool = True,
31
- sparse_coerce: Union[sparse.csr_matrix, sparse.csc_matrix] = sparse.csr_matrix,
31
+ sparse_format: Union[sparse.csr_matrix, sparse.csc_matrix] = sparse.csr_matrix,
32
32
  validate: bool = True,
33
33
  **kwargs,
34
34
  ):
@@ -66,7 +66,7 @@ class SparseCellArray(CellArray):
66
66
  Whether to return a sparse representation of the data when object is sliced.
67
67
  Default is to return a dictionary that contains coordinates and values.
68
68
 
69
- sparse_coerce:
69
+ sparse_format:
70
70
  Format to return, defaults to csr_matrix.
71
71
 
72
72
  validate:
@@ -86,7 +86,7 @@ class SparseCellArray(CellArray):
86
86
  )
87
87
 
88
88
  self.return_sparse = return_sparse
89
- self.sparse_coerce = sparse.csr_matrix if sparse_coerce is None else sparse_coerce
89
+ self.sparse_format = sparse.csr_matrix if sparse_format is None else sparse_format
90
90
 
91
91
  def _validate_matrix_dims(self, data: sparse.spmatrix) -> Tuple[sparse.coo_matrix, bool]:
92
92
  """Validate and adjust matrix dimensions if needed.
@@ -126,7 +126,7 @@ class SparseCellArray(CellArray):
126
126
  shape.append(idx.stop - (idx.start or 0))
127
127
  elif isinstance(idx, list):
128
128
  shape.append(len(set(idx)))
129
- else: # single integer
129
+ else:
130
130
  shape.append(1)
131
131
 
132
132
  # Always return (n,1) shape for CSR matrix
@@ -140,20 +140,16 @@ class SparseCellArray(CellArray):
140
140
  """Convert TileDB result to CSR format or dense array."""
141
141
  data = result[self._attr]
142
142
 
143
- # empty result
144
143
  if len(data) == 0:
145
- print("is emoty")
146
144
  if not self.return_sparse:
147
145
  return result
148
146
  else:
149
- # For COO output, return empty sparse matrix
150
147
  if self.ndim == 1:
151
- matrix = self.sparse_coerce((1, shape[0]))
148
+ matrix = self.sparse_format((1, shape[0]))
152
149
  return matrix[:, key[0]]
153
150
 
154
- return self.sparse_coerce(shape)[key]
151
+ return self.sparse_format(shape)[key]
155
152
 
156
- # Get coordinates
157
153
  coords = []
158
154
  for dim_name in self.dim_names:
159
155
  dim_coords = result[dim_name]
@@ -164,11 +160,12 @@ class SparseCellArray(CellArray):
164
160
  coords = [np.zeros_like(coords[0]), coords[0]]
165
161
  shape = (1, shape[0])
166
162
 
167
- # Create sparse matrix
168
163
  matrix = sparse.coo_matrix((data, tuple(coords)), shape=shape)
169
- if self.sparse_coerce in (sparse.csr_matrix, sparse.csr_array):
164
+
165
+ sliced = matrix
166
+ if self.sparse_format in (sparse.csr_matrix, sparse.csr_array):
170
167
  sliced = matrix.tocsr()
171
- elif self.sparse_coerce in (sparse.csc_matrix, sparse.csc_array):
168
+ elif self.sparse_format in (sparse.csc_matrix, sparse.csc_array):
172
169
  sliced = matrix.tocsc()
173
170
 
174
171
  if self.ndim == 1:
@@ -200,7 +197,6 @@ class SparseCellArray(CellArray):
200
197
  if all(isinstance(idx, slice) for idx in optimized_key):
201
198
  return self._direct_slice(tuple(optimized_key))
202
199
 
203
- # For mixed slice-list queries, adjust slice bounds
204
200
  tiledb_key = []
205
201
  for idx in key:
206
202
  if isinstance(idx, slice):
@@ -239,10 +235,8 @@ class SparseCellArray(CellArray):
239
235
  if not sparse.issparse(data):
240
236
  raise TypeError("Input must be a scipy sparse matrix.")
241
237
 
242
- # Validate and adjust dimensions
243
238
  coo_data, is_1d = self._validate_matrix_dims(data)
244
239
 
245
- # Check bounds
246
240
  end_row = start_row + coo_data.shape[0]
247
241
  if end_row > self.shape[0]:
248
242
  raise ValueError(
@@ -0,0 +1,3 @@
1
+ from .denseloader import DenseArrayDataset, construct_dense_array_dataloader
2
+ from .iterabledataloader import CellArrayIterableDataset, construct_iterable_dataloader
3
+ from .sparseloader import SparseArrayDataset, construct_sparse_array_dataloader