PyPI - cellarr-array - Versions diffs - 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

cellarr-array 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cellarr-array might be problematic. Click here for more details.

Files changed (20) hide show

cellarr_array/__init__.py +3 -4
cellarr_array/core/__init__.py +3 -0
cellarr_array/{cellarray_base.py → core/base.py} +50 -3
cellarr_array/{cellarray_dense.py → core/dense.py} +2 -3
cellarr_array/{helpers.py → core/helpers.py} +101 -51
cellarr_array/{cellarray_sparse.py → core/sparse.py} +11 -17
cellarr_array/dataloaders/__init__.py +3 -0
cellarr_array/dataloaders/denseloader.py +198 -0
cellarr_array/dataloaders/iterabledataloader.py +320 -0
cellarr_array/dataloaders/sparseloader.py +230 -0
cellarr_array/dataloaders/utils.py +26 -0
cellarr_array/utils/__init__.py +3 -0
cellarr_array/utils/mock.py +167 -0
{cellarr_array-0.1.0.dist-info → cellarr_array-0.3.0.dist-info}/METADATA +6 -1
cellarr_array-0.3.0.dist-info/RECORD +19 -0
{cellarr_array-0.1.0.dist-info → cellarr_array-0.3.0.dist-info}/WHEEL +1 -1
cellarr_array-0.1.0.dist-info/RECORD +0 -11
/cellarr_array/{config.py → utils/config.py} +0 -0
{cellarr_array-0.1.0.dist-info → cellarr_array-0.3.0.dist-info}/licenses/LICENSE.txt +0 -0
{cellarr_array-0.1.0.dist-info → cellarr_array-0.3.0.dist-info}/top_level.txt +0 -0

cellarr_array/dataloaders/sparseloader.py ADDED Viewed

@@ -0,0 +1,230 @@
+from typing import Optional
+from warnings import warn
+import scipy.sparse as sp
+import tiledb
+import torch
+from torch.utils.data import DataLoader, Dataset
+from ..core.sparse import SparseCellArray
+__author__ = "Jayaram Kancherla"
+__copyright__ = "Jayaram Kancherla"
+__license__ = "MIT"
+class SparseArrayDataset(Dataset):
+    """Map-style PyTorch dataset that yields single rows of a sparse TileDB array.
+    The backing ``SparseCellArray`` is opened on the first ``__getitem__`` call
+    rather than in ``__init__``, and every sample is returned as a SciPy COO matrix.
+    """
+    def __init__(
+        self,
+        array_uri: str,
+        attribute_name: str = "data",
+        num_rows: Optional[int] = None,
+        num_columns: Optional[int] = None,
+        sparse_format=sp.csr_matrix,
+        cellarr_ctx_config: Optional[dict] = None,
+        transform=None,
+    ):
+        """PyTorch Dataset for sparse TileDB arrays accessed via SparseCellArray.
+        Args:
+            array_uri:
+                URI of the TileDB sparse array.
+            attribute_name:
+                Name of the attribute to read from.
+            num_rows:
+                Total number of rows in the dataset.
+                If None, will infer from `array.shape[0]`.
+            num_columns:
+                The number of columns in the dataset.
+                If None, will attempt to infer from `array.shape[1]`.
+            sparse_format:
+                Format to return, defaults to csr_matrix.
+            cellarr_ctx_config:
+                Optional TileDB context configuration dict for CellArray.
+            transform:
+                Optional transform to be applied on a sample.
+        Raises:
+            ValueError:
+                If a missing dimension cannot be inferred from the array, or if
+                the column count is missing/non-positive for a non-empty dataset.
+        """
+        self.array_uri = array_uri
+        self.attribute_name = attribute_name
+        self.sparse_format = sparse_format
+        self.cellarr_ctx_config = cellarr_ctx_config
+        self.transform = transform
+        # Opened on first access by `_init_worker_state`, not here.
+        self.cell_array_instance = None
+        if num_rows is not None and num_columns is not None:
+            # Both dimensions supplied: the array is not touched at construction time.
+            self._len = num_rows
+            self.num_columns = num_columns
+        else:
+            self._len, self.num_columns = self._probe_shape(num_rows, num_columns)
+        # Parenthesised to make the precedence explicit: a non-positive column
+        # count is only an error when the dataset actually has rows.
+        if self.num_columns is None or (self.num_columns <= 0 and self._len > 0):
+            raise ValueError(
+                f"num_columns ({self.num_columns}) is invalid or could not be determined for sparse array '{array_uri}'."
+            )
+        if self._len == 0:
+            warn(f"SparseDataset for '{array_uri}' has length 0.", RuntimeWarning)
+    def _probe_shape(self, num_rows: Optional[int], num_columns: Optional[int]):
+        """Open the array once to fill in whichever dimension the caller omitted.
+        Args:
+            num_rows:
+                Caller-provided row count, or None to read `shape[0]`.
+            num_columns:
+                Caller-provided column count, or None to read `shape[1]`.
+                Ignored for 1-dimensional arrays, which always report 1 column.
+        Returns:
+            A `(rows, columns)` tuple.
+        Raises:
+            ValueError:
+                If the array cannot be opened or inspected, or has more than 2 dimensions.
+        """
+        print(f"Dataset '{self.array_uri}': num_rows or num_columns not provided. Probing sparse array...")
+        init_ctx_config = tiledb.Config(self.cellarr_ctx_config) if self.cellarr_ctx_config else None
+        try:
+            # NOTE(review): this call passes `sparse_format=` while `_init_worker_state`
+            # passes `sparse_coerce=`; confirm which keyword SparseCellArray accepts.
+            temp_arr = SparseCellArray(
+                uri=self.array_uri,
+                attr=self.attribute_name,
+                config_or_context=init_ctx_config,
+                return_sparse=True,
+                sparse_format=self.sparse_format,
+            )
+            if temp_arr.ndim not in (1, 2):
+                raise ValueError(f"Array ndim {temp_arr.ndim} not supported.")
+            inferred_rows = num_rows if num_rows is not None else temp_arr.shape[0]
+            if temp_arr.ndim == 1:
+                inferred_columns = 1
+            else:
+                inferred_columns = num_columns if num_columns is not None else temp_arr.shape[1]
+        except Exception as e:
+            # This path is only reached when a dimension is missing, so there is
+            # nothing to fall back on: every probing failure is reported to the caller.
+            raise ValueError(
+                f"num_rows and num_columns must be provided if inferring sparse array shape fails for '{self.array_uri}'. Original error: {e}"
+            ) from e
+        print(f"Dataset '{self.array_uri}': Inferred sparse shape. Rows: {inferred_rows}, Columns: {inferred_columns}")
+        return inferred_rows, inferred_columns
+    def _init_worker_state(self):
+        """Open the read-mode SparseCellArray if this instance has not done so yet."""
+        if self.cell_array_instance is None:
+            ctx = tiledb.Ctx(self.cellarr_ctx_config) if self.cellarr_ctx_config else None
+            # NOTE(review): keyword is `sparse_coerce` here but `sparse_format` in the
+            # shape probe; confirm against the SparseCellArray signature.
+            self.cell_array_instance = SparseCellArray(
+                uri=self.array_uri,
+                attr=self.attribute_name,
+                mode="r",
+                config_or_context=ctx,
+                return_sparse=True,
+                sparse_coerce=self.sparse_format,
+            )
+    def __len__(self):
+        """Return the number of rows exposed by the dataset."""
+        return self._len
+    def __getitem__(self, idx):
+        """Read row `idx` and return it as a SciPy COO matrix.
+        Raises:
+            IndexError:
+                If `idx` is outside `[0, len(self))`; negative indices are rejected.
+        """
+        if not 0 <= idx < self._len:
+            raise IndexError(f"Index {idx} out of bounds for dataset of length {self._len}.")
+        self._init_worker_state()
+        # Slice a single row but keep it 2-D: (idx:idx+1, all columns).
+        item_slice = (slice(idx, idx + 1), slice(None))
+        scipy_sparse_sample = self.cell_array_instance[item_slice]
+        if self.transform:  # e.g., convert to COO for easier collation
+            scipy_sparse_sample = self.transform(scipy_sparse_sample)
+        # Whatever the configured format or transform returned, hand out COO.
+        if not isinstance(scipy_sparse_sample, sp.coo_matrix):
+            scipy_sparse_sample = scipy_sparse_sample.tocoo()
+        return scipy_sparse_sample
+def sparse_coo_collate_fn(batch):
+    """Custom collate_fn for a batch of SciPy COO sparse matrices.
+    Converts them into a single batched PyTorch sparse COO tensor.
+    Each item in 'batch' is a SciPy coo_matrix representing one sample; the
+    sample's position in the batch becomes its row index in the result, and
+    the sample's own row coordinates are ignored.
+    Args:
+        batch:
+            Sequence of SciPy COO matrices; the column count is taken from the first one.
+    Returns:
+        A sparse COO tensor of shape `(len(batch), num_columns)`, or `(0, 0)`
+        for an empty batch.
+    """
+    batch_size = len(batch)
+    if batch_size == 0:
+        return torch.sparse_coo_tensor(torch.empty((2, 0), dtype=torch.long), torch.empty(0), (0, 0))
+    num_columns = batch[0].shape[1]
+    all_data = []
+    all_row_indices = []
+    all_col_indices = []
+    for i, scipy_coo in enumerate(batch):
+        if scipy_coo.nnz == 0:
+            continue
+        all_data.append(torch.from_numpy(scipy_coo.data))
+        all_row_indices.append(torch.full((scipy_coo.nnz,), i, dtype=torch.long))
+        # The SciPy index dtype may differ from int64; cast so rows and columns
+        # stack without relying on implicit dtype promotion.
+        all_col_indices.append(torch.from_numpy(scipy_coo.col).to(torch.long))
+    if not all_data:
+        # No stored values anywhere in the batch: reuse the first sample's (empty)
+        # data buffer so the result keeps the same dtype as non-empty batches.
+        empty_values = torch.from_numpy(batch[0].data)
+        return torch.sparse_coo_tensor(
+            torch.empty((2, 0), dtype=torch.long), empty_values, (batch_size, num_columns)
+        )
+    indices = torch.stack([torch.cat(all_row_indices), torch.cat(all_col_indices)], dim=0)
+    return torch.sparse_coo_tensor(indices, torch.cat(all_data), (batch_size, num_columns))
+def construct_sparse_array_dataloader(
+    array_uri: str,
+    attribute_name: str = "data",
+    num_rows: Optional[int] = None,
+    num_columns: Optional[int] = None,
+    batch_size: int = 1000,
+    num_workers_dl: int = 2,
+) -> Optional[DataLoader]:
+    """Construct an instance of `SparseArrayDataset` with PyTorch DataLoader.
+    Args:
+        array_uri:
+            URI of the TileDB array.
+        attribute_name:
+            Name of the attribute to read from.
+        num_rows:
+            The total number of rows in the TileDB array.
+        num_columns:
+            The total number of columns in the TileDB array.
+        batch_size:
+            Number of random samples per batch generated by the dataset.
+        num_workers_dl:
+            Number of worker processes for the DataLoader.
+    Returns:
+        A shuffling `DataLoader` that collates samples with
+        `sparse_coo_collate_fn`, or None when the dataset has no rows.
+    """
+    tiledb_ctx_config = {
+        "sm.tile_cache_size": 1000 * 1024**2,
+        "sm.num_reader_threads": 4,
+    }
+    dataset = SparseArrayDataset(
+        array_uri=array_uri,
+        attribute_name=attribute_name,
+        num_rows=num_rows,
+        num_columns=num_columns,
+        sparse_format=sp.coo_matrix,
+        cellarr_ctx_config=tiledb_ctx_config,
+    )
+    if len(dataset) == 0:
+        print("Dataset is empty, cannot create DataLoader.")
+        # Explicit None so the behaviour matches the Optional return annotation.
+        return None
+    return DataLoader(
+        dataset,
+        batch_size=batch_size,
+        shuffle=True,
+        num_workers=num_workers_dl,
+        collate_fn=sparse_coo_collate_fn,
+        pin_memory=False,
+        # Persistent workers are only requested when worker processes exist.
+        persistent_workers=num_workers_dl > 0,
+    )

cellarr_array/dataloaders/utils.py ADDED Viewed

@@ -0,0 +1,26 @@
+import random
+import numpy as np
+__author__ = "Jayaram Kancherla"
+__copyright__ = "Jayaram Kancherla"
+__license__ = "MIT"
+def seed_worker(worker_id: int):
+    """Seed Python's and NumPy's global RNGs from the current torch seed.
+    The value of `torch.initial_seed()` is reduced modulo 2**32 and used to
+    seed both the `random` module and `np.random`.
+    Args:
+        worker_id:
+            The ID of the worker process (accepted but not used).
+    """
+    import torch
+    seed_32bit = torch.initial_seed() % 2**32
+    random.seed(seed_32bit)
+    np.random.seed(seed_32bit)

cellarr_array/utils/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .config import CellArrConfig, ConsolidationConfig
+from ..core.helpers import create_cellarray
+# from .mock import generate_tiledb_dense_array, generate_tiledb_sparse_array

cellarr_array/utils/mock.py ADDED Viewed

@@ -0,0 +1,167 @@
+import shutil
+from typing import Dict, Optional
+import numpy as np
+import scipy.sparse as sp
+import tiledb
+from ..core import DenseCellArray, SparseCellArray
+from ..core.helpers import CellArrConfig, create_cellarray
+__author__ = "Jayaram Kancherla"
+__copyright__ = "Jayaram Kancherla"
+__license__ = "MIT"
+def generate_tiledb_dense_array(
+    uri: str,
+    rows: int,
+    cols: int,
+    attr_name: str = "data",
+    attr_dtype: np.dtype = np.float32,
+    chunk_size: int = 1000,
+    tiledb_config: Optional[Dict] = None,
+):
+    """Create a dense TileDB array at `uri` and fill it with random values.
+    An array that already exists at `uri` is removed first. Values come from
+    `np.random.rand`, are cast to `attr_dtype`, and are written in row chunks.
+    Args:
+        uri:
+            URI for the new TileDB array.
+        rows:
+            Number of rows.
+        cols:
+            Number of columns (features).
+        attr_name:
+            Name of the attribute.
+        attr_dtype:
+            Data type of the attribute.
+        chunk_size:
+            Number of rows to write per batch.
+        tiledb_config:
+            TileDB context configuration, used for the writer's context.
+    """
+    already_present = tiledb.array_exists(uri)
+    if already_present:
+        print(f"Array {uri} already exists. Removing.")
+        shutil.rmtree(uri)
+    print(f"Creating dense array at '{uri}' with shape ({rows}, {cols})")
+    cfg = CellArrConfig(ctx_config=tiledb_config or {})
+    # `cfg` is not forwarded to create_cellarray; only its ctx_config is used below.
+    create_cellarray(
+        uri=uri,
+        shape=(rows, cols),
+        attr_dtype=attr_dtype,
+        sparse=False,
+        dim_names=["rows", "cols"],
+        attr_name=attr_name,
+    )
+    ctx = None
+    if cfg.ctx_config:
+        ctx = tiledb.Ctx(cfg.ctx_config)
+    writer = DenseCellArray(uri=uri, attr=attr_name, mode="w", config_or_context=ctx)
+    print("shape of writer", writer.shape)
+    print(f"Writing data to dense array '{uri}'...")
+    # chunk_no equals start // chunk_size because every start is a multiple of chunk_size.
+    for chunk_no, start in enumerate(range(0, rows, chunk_size)):
+        stop = min(start + chunk_size, rows)
+        n_chunk_rows = stop - start
+        block = np.random.rand(n_chunk_rows, cols).astype(attr_dtype)
+        print(start, stop, n_chunk_rows, block.shape)
+        writer.write_batch(block, start_row=start)
+        # Progress line on every tenth chunk.
+        if chunk_no % 10 == 0:
+            print(f"  Dense write: {stop}/{rows} rows written.")
+    print(f"Finished writing to dense array '{uri}'.")
+def generate_tiledb_sparse_array(
+    uri: str,
+    rows: int,
+    cols: int,
+    density: float = 0.01,
+    attr_name: str = "data",
+    attr_dtype: np.dtype = np.float32,
+    chunk_size: int = 1000,
+    tiledb_config: Optional[Dict] = None,
+    sparse_format_to_write="coo",
+):
+    """Create a sparse TileDB array at `uri` and fill it with random values.
+    An array that already exists at `uri` is removed first. Each row chunk is
+    generated with `scipy.sparse.random` and written only if it holds at least
+    one stored value.
+    Args:
+        uri:
+            URI for the new TileDB array.
+        rows:
+            Number of rows.
+        cols:
+            Number of columns (features).
+        density:
+            Density of the sparse matrix.
+        attr_name:
+            Name of the attribute.
+        attr_dtype:
+            Data type of the attribute.
+        chunk_size:
+            Number of rows to generate and write per batch.
+        tiledb_config:
+            TileDB context configuration, used for the writer's context.
+        sparse_format_to_write:
+            Scipy sparse format to use for generating chunks ('coo', 'csr', 'csc').
+    """
+    already_present = tiledb.array_exists(uri)
+    if already_present:
+        print(f"Array {uri} already exists. Removing.")
+        shutil.rmtree(uri)
+    print(f"Creating sparse array at '{uri}' with shape ({rows}, {cols}), density ~{density}")
+    cfg = CellArrConfig(ctx_config=tiledb_config or {})
+    # `cfg` is not forwarded to create_cellarray; only its ctx_config is used below.
+    create_cellarray(
+        uri=uri,
+        shape=(rows, cols),
+        attr_dtype=attr_dtype,
+        sparse=True,
+        dim_names=["rows", "cols"],
+        attr_name=attr_name,
+    )
+    ctx = None
+    if cfg.ctx_config:
+        ctx = tiledb.Ctx(cfg.ctx_config)
+    writer = SparseCellArray(uri=uri, attr=attr_name, mode="w", config_or_context=ctx)
+    print(f"Writing data to sparse array '{uri}'...")
+    # chunk_no equals start // chunk_size because every start is a multiple of chunk_size.
+    for chunk_no, start in enumerate(range(0, rows, chunk_size)):
+        stop = min(start + chunk_size, rows)
+        n_chunk_rows = stop - start
+        if n_chunk_rows <= 0:
+            continue
+        sparse_block = sp.random(
+            n_chunk_rows, cols, density=density, format=sparse_format_to_write, dtype=attr_dtype
+        )
+        # Chunks with no stored values are not written.
+        if sparse_block.nnz > 0:
+            writer.write_batch(sparse_block, start_row=start)
+        # Progress line on every tenth chunk.
+        if chunk_no % 10 == 0:
+            print(f"  Sparse write: {stop}/{rows} rows processed for writing.")
+    print(f"Finished writing to sparse array '{uri}'.")

{cellarr_array-0.1.0.dist-info → cellarr_array-0.3.0.dist-info}/METADATA RENAMED Viewed

@@ -1,12 +1,13 @@
 Metadata-Version: 2.4
 Name: cellarr-array
-Version: 0.1.0
+Version: 0.3.0
 Summary: Base class for handling TileDB backed arrays.
 Home-page: https://github.com/cellarr/cellarr-array
 Author: Jayaram Kancherla
 Author-email: jayaram.kancherla@gmail.com
 License: MIT
 Project-URL: Documentation, https://github.com/cellarr/cellarr-array
+Project-URL: Source, https://github.com/cellarr/cellarr-array
 Platform: any
 Classifier: Development Status :: 4 - Beta
 Classifier: Programming Language :: Python
@@ -16,10 +17,14 @@ Requires-Dist: importlib-metadata; python_version < "3.8"
 Requires-Dist: tiledb
 Requires-Dist: numpy
 Requires-Dist: scipy
+Provides-Extra: optional
+Requires-Dist: torch; extra == "optional"
 Provides-Extra: testing
 Requires-Dist: setuptools; extra == "testing"
 Requires-Dist: pytest; extra == "testing"
 Requires-Dist: pytest-cov; extra == "testing"
+Requires-Dist: pandas; extra == "testing"
+Requires-Dist: torch; extra == "testing"
 Dynamic: license-file
 [![PyPI-Server](https://img.shields.io/pypi/v/cellarr-array.svg)](https://pypi.org/project/cellarr-array/)

cellarr_array-0.3.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,19 @@
+cellarr_array/__init__.py,sha256=vo-WXpnb83eJPitY2PgOaASHSfLqECF_UFM3YcbPTIs,732
+cellarr_array/core/__init__.py,sha256=fvM-FEiDn8TKDbHxhhzp9FXZFNovFwvIUSY6SpLQRdk,98
+cellarr_array/core/base.py,sha256=zzG76-zN2oJ7BHldT9ohQhklD-paOYnY0u7_w1DzXR8,15188
+cellarr_array/core/dense.py,sha256=LODRH4utpKs8xhT79Q2-nRiam_s68_a0qPj0unEM7rg,3940
+cellarr_array/core/helpers.py,sha256=w1yXi7eJiL9D1vzpSp_eQbTVnyKRFp5TBpq1DLj4k9U,8262
+cellarr_array/core/sparse.py,sha256=cBxdN7a-fAsQWT6Kc6EqV_5yuEiZ-mtJzSiETKsxgmA,8814
+cellarr_array/dataloaders/__init__.py,sha256=U-MfwC2K84OIXT75in41fe_wvoxjUC5Krb5zICQn_O8,245
+cellarr_array/dataloaders/denseloader.py,sha256=JYJlbuX5My64iIPW_-nlPFkNIezxL3Z3mkwInS3hH9M,7291
+cellarr_array/dataloaders/iterabledataloader.py,sha256=lR2T1YatyBlDM5Sy_75B7_8ORiWfn3cp4q48Oujwf-c,11916
+cellarr_array/dataloaders/sparseloader.py,sha256=V_eKw-Z_CNxHP8c2BN3sOuuv6RPiWBzRfW1BYLhNaQc,7962
+cellarr_array/dataloaders/utils.py,sha256=buJ87x1YBTt5-nZoy_I5j6ko1lVlHdiGpQCusdLoRLI,600
+cellarr_array/utils/__init__.py,sha256=DM5jeUMbxbRzTu2QCjpLlrTQ5uionF887S_7i6_952U,177
+cellarr_array/utils/config.py,sha256=67zBxpYY9N_v6TMdyljUIZmckbwOBcuLC99aJooGmfA,2917
+cellarr_array/utils/mock.py,sha256=7GyCbtM7u94pm7qhjsPRSO2IWYLmd4UrjyvLnQtMMkc,4579
+cellarr_array-0.3.0.dist-info/licenses/LICENSE.txt,sha256=JUlHIfWcRe_MZop18pQvMIPLKSSPz3XQ06ASHuW5Wh8,1076
+cellarr_array-0.3.0.dist-info/METADATA,sha256=J9LgoIMWYKpXNwLEyhxfsIKLtkUitF-0RACpaqzJy7c,4332
+cellarr_array-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+cellarr_array-0.3.0.dist-info/top_level.txt,sha256=oErp0D8ABZV-QPtTiXT8_F2z36Ic7ykuDg_1Y84HLZM,14
+cellarr_array-0.3.0.dist-info/RECORD,,

{cellarr_array-0.1.0.dist-info → cellarr_array-0.3.0.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.7.1)
+Generator: setuptools (80.9.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

cellarr_array-0.1.0.dist-info/RECORD DELETED Viewed

@@ -1,11 +0,0 @@
-cellarr_array/__init__.py,sha256=iCU5zmXXmTwk-VuwrTdVl5STRAL2xeYpq05fL9_bW6w,781
-cellarr_array/cellarray_base.py,sha256=CSYsA_Ra-RcwsyHzwayL-w10EhpbIC3u7ZAbyQMO6ks,13451
-cellarr_array/cellarray_dense.py,sha256=skunPy_WyOMuS_3SxcAW_gm8d5FiWeV7ZCQp4HLRUUY,3958
-cellarr_array/cellarray_sparse.py,sha256=YYZymvWGDG1c2EeOLMBPP5_u4qM8uhxyWJY6PnFWMVo,9112
-cellarr_array/config.py,sha256=67zBxpYY9N_v6TMdyljUIZmckbwOBcuLC99aJooGmfA,2917
-cellarr_array/helpers.py,sha256=eIeymmvY4KZ-cAiROo3DcYYzP39NQBj-4Nrba9rrEKQ,6491
-cellarr_array-0.1.0.dist-info/licenses/LICENSE.txt,sha256=JUlHIfWcRe_MZop18pQvMIPLKSSPz3XQ06ASHuW5Wh8,1076
-cellarr_array-0.1.0.dist-info/METADATA,sha256=ELBRCXkEyxhPeGHlA62i2QIzz7yYlLUSy7bfOe6aAdE,4120
-cellarr_array-0.1.0.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
-cellarr_array-0.1.0.dist-info/top_level.txt,sha256=oErp0D8ABZV-QPtTiXT8_F2z36Ic7ykuDg_1Y84HLZM,14
-cellarr_array-0.1.0.dist-info/RECORD,,

/cellarr_array/{config.py → utils/config.py} RENAMED Viewed

File without changes

{cellarr_array-0.1.0.dist-info → cellarr_array-0.3.0.dist-info}/licenses/LICENSE.txt RENAMED Viewed

File without changes

{cellarr_array-0.1.0.dist-info → cellarr_array-0.3.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

cellarr-array 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

Potentially problematic release.

cellarr-array 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl