PyPI - cellarr-array - Versions diffs - 0.1.0__tar.gz → 0.3.1__tar.gz - Mend

cellarr-array 0.1.0__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

{cellarr_array-0.1.0 → cellarr_array-0.3.1}/.github/workflows/publish-pypi.yml RENAMED Viewed

@@ -19,19 +19,19 @@ jobs:
     steps:
       - uses: actions/checkout@v4
-      - name: Set up Python 3.11
+      - name: Set up Python 3.12
         uses: actions/setup-python@v5
         with:
-          python-version: 3.11
+          python-version: 3.12
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
           pip install tox
-      - name: Test with tox
-        run: |
-          tox
+      # - name: Test with tox
+      #   run: |
+      #     tox
       - name: Build docs
         run: |

{cellarr_array-0.1.0 → cellarr_array-0.3.1}/.github/workflows/run-tests.yml RENAMED Viewed

@@ -28,7 +28,7 @@ jobs:
   test:
     strategy:
       matrix:
-        python: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+        python: ["3.10", "3.11", "3.12", "3.13", "3.14"]
         platform:
           - ubuntu-latest
           # - macos-latest

{cellarr_array-0.1.0 → cellarr_array-0.3.1}/.gitignore RENAMED Viewed

@@ -52,3 +52,5 @@ MANIFEST
 .venv*/
 .conda*/
 .python-version
+*.tdb

{cellarr_array-0.1.0 → cellarr_array-0.3.1}/.pre-commit-config.yaml RENAMED Viewed

@@ -2,7 +2,7 @@ exclude: '^docs/conf.py'
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v5.0.0
+  rev: v6.0.0
   hooks:
   - id: trailing-whitespace
   - id: check-added-large-files
@@ -19,7 +19,7 @@ repos:
 - repo: https://github.com/astral-sh/ruff-pre-commit
   # Ruff version.
-  rev: v0.11.5
+  rev: v0.14.3
   hooks:
     - id: ruff
       args: [--fix, --exit-non-zero-on-fix]

{cellarr_array-0.1.0 → cellarr_array-0.3.1}/CHANGELOG.md RENAMED Viewed

@@ -1,5 +1,19 @@
 # Changelog
+## Version 0.3.0 - 0.3.1
+- Support for string dimensions when creating cellarr arrays.
+- Support query conditions for slice operations.
+- Added unique dim values. Only supported for sparse arrays.
+- Fix a minor bug causing memory leaks on large sparse arrays.
+- EOL for Python 3.9
+## Version 0.2.0
+- Dataloaders for sparse and dense arrays. We provide templates for both map-style and iterable-style dataloaders. Users are expected to understand the caveats of both of these approaches.
+- Fixed a bug with slicing on 1D arrays and many improvements for optimizing slicing parameters.
+- Update documentation and tests.
 ## Version 0.1.0
 - Support cellarr-arrays on user provided tiledb array objects.

{cellarr_array-0.1.0 → cellarr_array-0.3.1}/PKG-INFO RENAMED Viewed

@@ -1,12 +1,13 @@
 Metadata-Version: 2.4
 Name: cellarr-array
-Version: 0.1.0
+Version: 0.3.1
 Summary: Base class for handling TileDB backed arrays.
 Home-page: https://github.com/cellarr/cellarr-array
 Author: Jayaram Kancherla
 Author-email: jayaram.kancherla@gmail.com
 License: MIT
 Project-URL: Documentation, https://github.com/cellarr/cellarr-array
+Project-URL: Source, https://github.com/cellarr/cellarr-array
 Platform: any
 Classifier: Development Status :: 4 - Beta
 Classifier: Programming Language :: Python
@@ -16,10 +17,14 @@ Requires-Dist: importlib-metadata; python_version < "3.8"
 Requires-Dist: tiledb
 Requires-Dist: numpy
 Requires-Dist: scipy
+Provides-Extra: optional
+Requires-Dist: torch; extra == "optional"
 Provides-Extra: testing
 Requires-Dist: setuptools; extra == "testing"
 Requires-Dist: pytest; extra == "testing"
 Requires-Dist: pytest-cov; extra == "testing"
+Requires-Dist: pandas; extra == "testing"
+Requires-Dist: torch; extra == "testing"
 Dynamic: license-file
 [![PyPI-Server](https://img.shields.io/pypi/v/cellarr-array.svg)](https://pypi.org/project/cellarr-array/)

{cellarr_array-0.1.0 → cellarr_array-0.3.1}/setup.cfg RENAMED Viewed

@@ -10,6 +10,7 @@ long_description_content_type = text/markdown; charset=UTF-8; variant=GFM
 url = https://github.com/cellarr/cellarr-array
 project_urls =
 	Documentation = https://github.com/cellarr/cellarr-array
+	Source = https://github.com/cellarr/cellarr-array
 platforms = any
 classifiers =
 	Development Status :: 4 - Beta
@@ -33,10 +34,14 @@ exclude =
 	tests
 [options.extras_require]
+optional =
+	torch
 testing =
 	setuptools
 	pytest
 	pytest-cov
+	pandas
+	%(optional)s
 [options.entry_points]

{cellarr_array-0.1.0 → cellarr_array-0.3.1}/src/cellarr_array/__init__.py RENAMED Viewed

@@ -15,7 +15,6 @@ except PackageNotFoundError:  # pragma: no cover
 finally:
     del version, PackageNotFoundError
-from .config import CellArrConfig, ConsolidationConfig
-from .cellarray_dense import DenseCellArray
-from .cellarray_sparse import SparseCellArray
-from .helpers import create_cellarray, SliceHelper
+from .core import DenseCellArray, SparseCellArray
+from .core.helpers import create_cellarray
+from .utils import CellArrConfig, ConsolidationConfig

cellarr_array-0.3.1/src/cellarr_array/core/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .base import CellArray
+from .dense import DenseCellArray
+from .sparse import SparseCellArray

cellarr_array-0.1.0/src/cellarr_array/cellarray_base.py → cellarr_array-0.3.1/src/cellarr_array/core/base.py RENAMED Viewed

@@ -12,7 +12,7 @@ import numpy as np
 import tiledb
 from scipy import sparse
-from .config import ConsolidationConfig
+from ..utils.config import ConsolidationConfig
 from .helpers import SliceHelper
 __author__ = "Jayaram Kancherla"
@@ -119,6 +119,7 @@ class CellArray(ABC):
         self._shape = None
         self._ndim = None
         self._dim_names = None
+        self._dim_dtypes = None
         self._attr_names = None
         self._nonempty_domain = None
@@ -185,7 +186,16 @@ class CellArray(ABC):
     def shape(self) -> Tuple[int, ...]:
         if self._shape is None:
             with self.open_array(mode="r") as A:
-                self._shape = tuple(int(dim.domain[1] - dim.domain[0] + 1) for dim in A.schema.domain)
+                shape_list = []
+                for dim in A.schema.domain:
+                    try:
+                        # This will fail for string dimensions
+                        shape_list.append(dim.shape[0])
+                    except TypeError:
+                        # For string dimensions, the shape is not well-defined.
+                        # We use a large number as a placeholder for slicing purposes.
+                        shape_list.append(2**63 - 1)
+                self._shape = tuple(shape_list)
         return self._shape
     @property
@@ -209,6 +219,14 @@ class CellArray(ABC):
                 # self._ndim = len(self.shape)
         return self._ndim
+    @property
+    def dim_dtypes(self) -> List[np.dtype]:
+        """Get dimension dtypes of the array."""
+        if self._dim_dtypes is None:
+            with self.open_array(mode="r") as A:
+                self._dim_dtypes = [dim.dtype for dim in A.schema.domain]
+        return self._dim_dtypes
     @contextmanager
     def open_array(self, mode: Optional[str] = None):
         """Context manager for array operations.
@@ -235,8 +253,8 @@ class CellArray(ABC):
                     ) from e
             effective_mode = mode if mode is not None else self._opened_array_external.mode
             current_external_mode = self._opened_array_external.mode
             if effective_mode == "r" and current_external_mode not in ["r", "w", "m"]:
                 # Read ops ok on write/modify modes
                 pass
@@ -259,35 +277,54 @@ class CellArray(ABC):
             finally:
                 array.close()
-    def __getitem__(self, key: Union[slice, EllipsisType, Tuple[Union[slice, List[int]], ...], EllipsisType]):
-        """Get item implementation that routes to either direct slicing or multi_index
-        based on the type of indices provided.
+    def __getitem__(self, key: Union[slice, EllipsisType, Tuple[Union[slice, List[int]], ...], EllipsisType, str]):
+        """Get item implementation that routes to either direct slicing, multi_index,
+        or query based on the type of indices provided.
         Args:
             key:
                 Slice or list of indices for each dimension in the array.
+                Alternatively, may be string to specify query conditions.
         """
+        # This is a query condition
+        if isinstance(key, str):
+            with self.open_array(mode="r") as array:
+                if self._attr is not None:
+                    return array.query(cond=key, attrs=[self._attr])[:]
+                else:
+                    array.query(cond=key)[:]
         if not isinstance(key, tuple):
             key = (key,)
-        if len(key) > self.ndim:
+        num_ellipsis = sum(isinstance(i, EllipsisType) for i in key)
+        if num_ellipsis > 1:
+            raise IndexError("an index can only have a single ellipsis ('...')")
+        if num_ellipsis == 1:
+            ellipsis_idx = key.index(Ellipsis)
+            num_other_indices = len(key) - 1
+            num_slices_to_add = self.ndim - num_other_indices
+            key = key[:ellipsis_idx] + (slice(None),) * num_slices_to_add + key[ellipsis_idx + 1 :]
+        if len(key) < self.ndim:
+            key = key + (slice(None),) * (self.ndim - len(key))
+        elif len(key) > self.ndim:
             raise IndexError(f"Invalid number of dimensions: got {len(key)}, expected {self.ndim}")
         # Normalize all indices
-        normalized_key = tuple(SliceHelper.normalize_index(idx, self.shape[i]) for i, idx in enumerate(key))
-        num_ellipsis = sum(isinstance(i, EllipsisType) for i in normalized_key)
-        if num_ellipsis > 1:
-            raise IndexError(f"Found more than 1 Ellipsis (...) in key: {normalized_key}")
+        normalized_key = tuple(
+            SliceHelper.normalize_index(idx, self.shape[i], self.dim_dtypes[i]) for i, idx in enumerate(key)
+        )
         # Check if we can use direct slicing
-        use_direct = all(isinstance(idx, (slice, EllipsisType)) for idx in normalized_key)
+        use_direct = all(isinstance(idx, slice) for idx in normalized_key)
         if use_direct:
             return self._direct_slice(normalized_key)
         else:
-            if num_ellipsis > 0:
-                raise IndexError(f"tiledb does not support ellipsis in multi-index access: {normalized_key}")
             return self._multi_index(normalized_key)
     @abstractmethod
@@ -342,3 +379,17 @@ class CellArray(ABC):
                 Additional arguments for write operation.
         """
         pass
+    def get_unique_dim_values(self, dim_name: Optional[str] = None) -> np.ndarray:
+        """Get unique values for a dimension.
+        Args:
+            dim_name:
+                The name of the dimension. If None, unique values for all
+                dimensions are returned.
+        Returns:
+            An array of unique dimension values.
+        """
+        with self.open_array(mode="r") as A:
+            return A.unique_dim_values(dim_name)

cellarr_array-0.1.0/src/cellarr_array/cellarray_dense.py → cellarr_array-0.3.1/src/cellarr_array/core/dense.py RENAMED Viewed

@@ -7,7 +7,7 @@ from typing import List, Tuple, Union
 import numpy as np
-from .cellarray_base import CellArray
+from .base import CellArray
 from .helpers import SliceHelper
 __author__ = "Jayaram Kancherla"
@@ -92,7 +92,6 @@ class DenseCellArray(CellArray):
         if len(data.shape) != self.ndim:
             raise ValueError(f"Data dimensions {data.shape} don't match array dimensions {self.shape}.")
-        # Check bounds
         end_row = start_row + data.shape[0]
         if end_row > self.shape[0]:
             raise ValueError(
@@ -102,7 +101,6 @@ class DenseCellArray(CellArray):
         if self.ndim == 2 and data.shape[1] != self.shape[1]:
             raise ValueError(f"Data columns {data.shape[1]} don't match array columns {self.shape[1]}.")
-        # Construct write region
         if self.ndim == 1:
             write_region = slice(start_row, end_row)
         else:  # 2D
@@ -110,4 +108,5 @@ class DenseCellArray(CellArray):
         # write_data = {self._attr: data} if len(self.attr_names) > 1 else data
         with self.open_array(mode="w") as array:
+            print("write_region", write_region)
             array[write_region] = data

{cellarr_array-0.1.0/src/cellarr_array → cellarr_array-0.3.1/src/cellarr_array/core}/helpers.py RENAMED Viewed

@@ -8,7 +8,7 @@ from typing import List, Optional, Tuple, Union
 import numpy as np
 import tiledb
-from .config import CellArrConfig
+from ..utils.config import CellArrConfig
 __author__ = "Jayaram Kancherla"
 __copyright__ = "Jayaram Kancherla"
@@ -52,7 +52,7 @@ def create_cellarray(
             Optional list of dimension names.
         dim_dtypes:
-            Optional list of dimension dtypes.
+            Optional list of dimension dtypes. Defaults to numpy's uint32.
         attr_name:
             Name of the data attribute.
@@ -67,29 +67,28 @@ def create_cellarray(
         ValueError: If dimensions are invalid or inputs are inconsistent.
     """
     config = config or CellArrConfig()
+    tiledb_ctx = tiledb.Config(config.ctx_config) if config.ctx_config else None
     if attr_dtype is None:
         attr_dtype = np.float32
     if isinstance(attr_dtype, str):
         attr_dtype = np.dtype(attr_dtype)
-    # Require either shape or dim_dtypes
     if shape is None and dim_dtypes is None:
         raise ValueError("Either 'shape' or 'dim_dtypes' must be provided.")
     if shape is not None:
         if len(shape) not in (1, 2):
-            raise ValueError("Only 1D and 2D arrays are supported.")
+            raise ValueError("Shape must have 1 or 2 dimensions.")
     # Set dimension dtypes, defaults to numpy uint32
     if dim_dtypes is None:
         dim_dtypes = [np.uint32] * len(shape)
     else:
         if len(dim_dtypes) not in (1, 2):
-            raise ValueError("Only 1D and 2D arrays are supported.")
+            raise ValueError("Array must have 1 or 2 dimensions.")
         dim_dtypes = [np.dtype(dt) if isinstance(dt, str) else dt for dt in dim_dtypes]
-    # Calculate shape from dtypes if needed
     if shape is None:
         shape = tuple(np.iinfo(dt).max if np.issubdtype(dt, np.integer) else None for dt in dim_dtypes)
     if None in shape:
@@ -97,7 +96,6 @@ def create_cellarray(
             np.iinfo(dt).max if s is None and np.issubdtype(dt, np.integer) else s for s, dt in zip(shape, dim_dtypes)
         )
-    # Set dimension names
     if dim_names is None:
         dim_names = [f"dim_{i}" for i in range(len(shape))]
@@ -105,42 +103,52 @@ def create_cellarray(
     if not (len(shape) == len(dim_dtypes) == len(dim_names)):
         raise ValueError("Lengths of 'shape', 'dim_dtypes', and 'dim_names' must match.")
-    dom = tiledb.Domain(
-        *[
-            tiledb.Dim(name=name, domain=(0, s - 1), tile=min(s, config.tile_capacity), dtype=dt)
-            for name, s, dt in zip(dim_names, shape, dim_dtypes)
-        ],
-        ctx=tiledb.Ctx(config.ctx_config),
-    )
+    dims = []
+    for name, s, dt in zip(dim_names, shape, dim_dtypes):
+        if np.issubdtype(dt, np.integer):
+            domain = (0, 0 if s == 0 else s - 1)
+            tile = min(1 if s == 0 else s // 2, config.tile_capacity // 2)
+            dim_dtype = dt
+        else:  # Assumes string or object dtype
+            domain = (None, None)
+            tile = None
+            dim_dtype = "ascii"
+        dims.append(
+            tiledb.Dim(
+                name=name,
+                domain=domain,
+                tile=tile,
+                dtype=dim_dtype,
+                filters=config.coords_filters,
+            )
+        )
-    attr = tiledb.Attr(
+    dom = tiledb.Domain(*dims, ctx=tiledb_ctx)
+    attr_obj = tiledb.Attr(
         name=attr_name,
         dtype=attr_dtype,
         filters=config.attrs_filters.get(attr_name, config.attrs_filters.get("", None)),
+        ctx=tiledb_ctx,
     )
     schema = tiledb.ArraySchema(
         domain=dom,
-        attrs=[attr],
+        attrs=[attr_obj],
         cell_order=config.cell_order,
         tile_order=config.tile_order,
         sparse=sparse,
-        coords_filters=config.coords_filters,
         offsets_filters=config.offsets_filters,
-        ctx=tiledb.Ctx(config.ctx_config),
+        ctx=tiledb_ctx,
     )
+    tiledb.Array.create(uri, schema, ctx=tiledb_ctx)
-    tiledb.Array.create(uri, schema)
+    from .dense import DenseCellArray
+    from .sparse import SparseCellArray
-    # Import here to avoid circular imports
-    from .cellarray_dense import DenseCellArray
-    from .cellarray_sparse import SparseCellArray
-    # Return appropriate array type
     return (
-        SparseCellArray(uri=uri, attr=attr_name, mode=mode)
+        SparseCellArray(uri=uri, attr=attr_name, mode=mode, config_or_context=tiledb_ctx)
         if sparse
-        else DenseCellArray(uri=uri, attr=attr_name, mode=mode)
+        else DenseCellArray(uri=uri, attr=attr_name, mode=mode, config_or_context=tiledb_ctx)
     )
@@ -149,58 +157,101 @@ class SliceHelper:
     @staticmethod
     def is_contiguous_indices(indices: List[int]) -> Optional[slice]:
-        """Check if indices can be represented as a contiguous slice."""
+        """Checks if a list of indices is contiguous and can be converted to a slice.
+        Returns None if the list is not contiguous or contains non-integers.
+        """
         if not indices:
             return None
-        diffs = np.diff(indices)
+        if not all(isinstance(i, (int, np.integer)) for i in indices):
+            return None
+        sorted_indices = sorted(list(set(indices)))
+        if not sorted_indices:
+            return None
+        if len(sorted_indices) == 1:
+            return slice(sorted_indices[0], sorted_indices[0] + 1, None)
+        diffs = np.diff(sorted_indices)
         if np.all(diffs == 1):
-            return slice(indices[0], indices[-1] + 1, None)
+            return slice(sorted_indices[0], sorted_indices[-1] + 1, None)
         return None
     @staticmethod
-    def normalize_index(idx: Union[int, slice, List[int]], dim_size: int) -> Union[slice, List[int], EllipsisType]:
+    def normalize_index(
+        idx: Union[int, range, slice, List, str, EllipsisType],
+        dim_size: int,
+        dim_dtype: np.dtype,
+    ) -> Union[slice, List, EllipsisType]:
         """Normalize index to handle negative indices and ensure consistency."""
+        is_string_dim = np.issubdtype(dim_dtype, np.str_) or np.issubdtype(dim_dtype, np.bytes_)
+        if is_string_dim:
+            if isinstance(idx, (str, bytes)):
+                return [idx]
+            if isinstance(idx, list) and all(isinstance(i, (str, bytes)) for i in idx):
+                return idx
+            if isinstance(idx, slice):
+                # For string dimensions, we do not normalize the slice with integer sizes
+                return idx
+            raise TypeError(f"Unsupported index type '{type(idx).__name__}' for string dimension.")
         if isinstance(idx, EllipsisType):
             return idx
-        # Convert ranges to slices
         if isinstance(idx, range):
             idx = slice(idx.start, idx.stop, idx.step)
         if isinstance(idx, slice):
-            start = idx.start if idx.start is not None else 0
-            stop = idx.stop if idx.stop is not None else dim_size
-            step = idx.step
+            start, stop, step = idx.start, idx.stop, idx.step
+            # Resolve None to full dimension slice parts
+            if start is None:
+                start = 0
+            if stop is None:
+                stop = dim_size
             # Handle negative indices
             if start < 0:
-                start = dim_size + start
+                start += dim_size
             if stop < 0:
-                stop = dim_size + stop
-            if start < 0 or start > dim_size:
-                raise IndexError(f"Start index {start} out of bounds for dimension size {dim_size}")
-            if stop < 0 or stop > dim_size:
-                raise IndexError(f"Stop index {stop} out of bounds for dimension size {dim_size}")
+                stop += dim_size
+            # Clamping slice arguments to dimensions
+            stop = min(stop, dim_size)
+            start = max(0, start)
             return slice(start, stop, step)
-        elif isinstance(idx, list):
-            norm_idx = [i if i >= 0 else dim_size + i for i in idx]
-            if any(i < 0 or i >= dim_size for i in norm_idx):
-                raise IndexError(f"List indices {idx} out of bounds for dimension size {dim_size}")
-            return norm_idx
+        if isinstance(idx, list):
+            if not idx:
+                return []
-        else:  # Single integer index
-            norm_idx = idx if idx >= 0 else dim_size + idx
+            # This check only applies to integer lists
+            if not all(isinstance(i, (int, np.integer)) for i in idx):
+                raise TypeError(
+                    "List indices must be all integers or all strings, but got mixed types or non-string/int types."
+                )
-            if norm_idx < 0 or norm_idx >= dim_size:
-                raise IndexError(f"Index {idx} out of bounds for dimension size {dim_size}")
+            norm_idx = [i if i >= 0 else dim_size + i for i in idx]
+            if any(i < 0 or i >= dim_size for i in norm_idx):
+                raise IndexError("List indices out of bounds for dimension size.")
+            return sorted(list(set(norm_idx)))
+        if isinstance(idx, (int, np.integer)):
+            norm_idx = int(idx)
+            if norm_idx < 0:
+                norm_idx += dim_size
+            if not (0 <= norm_idx < dim_size):
+                raise IndexError(f"Index {idx} out of bounds for dimension size.")
             return slice(norm_idx, norm_idx + 1, None)
+        raise TypeError(f"Index type {type(idx)} not supported for normalization.")
 def create_group(output_path, group_name):
     tiledb.group_create(f"{output_path}/{group_name}")

cellarr-array 0.1.0__tar.gz → 0.3.1__tar.gz

cellarr-array 0.1.0__tar.gz → 0.3.1__tar.gz