cap-anndata 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl

cap_anndata/__init__.py CHANGED
@@ -1,5 +1,5 @@
 from .backed_df import CapAnnDataDF
-from .backed_uns import CapAnnDataUns
+from .backed_dict import CapAnnDataDict
 from .cap_anndata import CapAnnData
 from .reader import (
     read_directly,
cap_anndata/backed_dict.py ADDED
@@ -0,0 +1,34 @@
+from typing import Set, Any
+
+
+class CapAnnDataDict(dict):
+    __keys_to_remove: Set[str] = None
+
+    def __delitem__(self, __key: Any) -> None:
+        self.keys_to_remove.add(__key)
+        return super().__delitem__(__key)
+
+    def __setitem__(self, __key: Any, __value: Any) -> None:
+        if __value is not None:
+            if __key in self.keys_to_remove:
+                self.keys_to_remove.remove(__key)
+        else:
+            self.keys_to_remove.add(__key)
+        return super().__setitem__(__key, __value)
+
+    @property
+    def keys_to_remove(self) -> Set[str]:
+        if self.__keys_to_remove is None:
+            self.__keys_to_remove = set()
+        return self.__keys_to_remove
+
+    def pop(self, __key: Any, __default: Any = None) -> Any:
+        if __key in self:
+            self.keys_to_remove.add(__key)
+        return super().pop(__key, __default)
+
+    def popitem(self) -> Any:
+        item = super().popitem()
+        key = item[0]
+        self.keys_to_remove.add(key)
+        return item
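
For orientation, a minimal sketch of the deletion tracking that `CapAnnDataDict` adds on top of `dict` (grounded in the class above; the keys and values are illustrative):

```python
from cap_anndata import CapAnnDataDict

d = CapAnnDataDict({"a": 1, "b": 2})
del d["a"]               # "a" is scheduled for removal from the file
d["b"] = None            # assigning None also schedules the key for removal
d["a"] = 3               # re-creating "a" cancels its scheduled removal
print(d.keys_to_remove)  # {'b'}
```

`overwrite()` later consumes `keys_to_remove` to delete the corresponding entries from the `.h5ad` file.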
cap_anndata/cap_anndata.py CHANGED
@@ -2,16 +2,23 @@ import logging
 import anndata as ad
 import numpy as np
 import h5py
-from typing import List, Union, Dict, Tuple, Final
-from anndata._io.specs import read_elem, write_elem
+from typing import List, Union, Any, Tuple, Final
+import scipy.sparse as ss
+from packaging import version
 
-from cap_anndata import CapAnnDataDF, CapAnnDataUns
+if version.parse(ad.__version__) < version.parse("0.11.0"):
+    from anndata.experimental import sparse_dataset, read_elem, write_elem
+else:
+    from anndata import sparse_dataset, read_elem, write_elem
 
+from cap_anndata import CapAnnDataDF, CapAnnDataDict
 
 logger = logging.getLogger(__name__)
 
-X_NOTATION = Union[h5py.Dataset, ad.experimental.CSRDataset, ad.experimental.CSCDataset]
-OBSM_NOTATION = Dict[str, X_NOTATION]
+X_NOTATION = Union[
+    h5py.Dataset, ad.experimental.CSRDataset, ad.experimental.CSCDataset, None
+]
+ARRAY_MAPPING_NOTATION = CapAnnDataDict[str, X_NOTATION]
 
 NotLinkedObject: Final = "__NotLinkedObject"
 
@@ -22,6 +29,10 @@ class BaseLayerMatrixAndDf:
         self._path_to_content = path_to_content
         self._X: X_NOTATION = None
 
+    @property
+    def file(self) -> h5py.File:
+        return self._file
+
     @property
     def X(self) -> X_NOTATION:
         if self._X is None:
@@ -35,7 +46,7 @@ class BaseLayerMatrixAndDf:
             self._X = x
         else:
             # sparse dataset
-            self._X = ad.experimental.sparse_dataset(x)
+            self._X = sparse_dataset(x)
 
     @property
     def shape(self) -> Tuple[int, int]:
@@ -76,6 +87,9 @@ class BaseLayerMatrixAndDf:
             # read whole df
             df = CapAnnDataDF.from_df(read_elem(h5_group), column_order=column_order)
         else:
+            if isinstance(columns, str):
+                # single column provided instead of list
+                columns = [columns]
             cols_to_read = [c for c in columns if c in column_order]
             df = CapAnnDataDF()
             df.column_order = column_order
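
With this guard, `_read_df` (and therefore `read_obs`/`read_var`) also accepts a single column name where a list was required before. A hedged usage sketch; the file and column names are illustrative:

```python
from cap_anndata import read_h5ad

with read_h5ad("your_data.h5ad") as cap_adata:
    cap_adata.read_obs(columns="cell_type")  # equivalent to columns=["cell_type"]
    print(cap_adata.obs.columns)             # ['cell_type']
```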
@@ -92,7 +106,9 @@ class BaseLayerMatrixAndDf:
         return df
 
     def _write_elem(self, dest_key: str, elem: any, compression: str) -> None:
-        write_elem(self._file, dest_key, elem, dataset_kwargs={"compression": compression})
+        write_elem(
+            self._file, dest_key, elem, dataset_kwargs={"compression": compression}
+        )
 
     def _validate_cap_df(self, cap_df: CapAnnDataDF, axis: int) -> None:
         if not isinstance(cap_df, CapAnnDataDF):
@@ -110,6 +126,64 @@ class BaseLayerMatrixAndDf:
                 "AnnData object!"
             )
 
+    def _link_array_mapping(self, cap_dict: CapAnnDataDict, key: str) -> None:
+        """Method to update given cap_dict with backed array entities from the file."""
+        if key not in self._file.keys():
+            raise KeyError(f"The key {key} doesn't exist in the file! Ignore linking.")
+
+        group = self._file[key]
+        if not isinstance(group, h5py.Group):
+            raise ValueError(f"The object {key} must be a group!")
+
+        for array_name in group.keys():
+            array = group[array_name]
+            if isinstance(array, h5py.Dataset):
+                cap_dict[array_name] = array
+            elif isinstance(array, h5py.Group):
+                cap_dict[array_name] = sparse_dataset(array)
+            else:
+                raise ValueError(
+                    f"Can't link array in {key} due to unsupported type of object: {type(array)}"
+                )
+
+    def _create_new_matrix(
+        self,
+        dest: str,
+        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
+        matrix_shape: Union[tuple[int, int], None] = None,
+        data_dtype: Union[np.dtype, None] = None,
+        format: Union[str, None] = None,  # TODO: use Enum instead of str
+        compression: str = "lzf",
+    ) -> None:
+        if matrix is not None:
+            self._write_elem(dest, matrix, compression=compression)
+        else:
+            if format == "dense":
+                group = self._file.create_dataset(
+                    name=dest,
+                    shape=matrix_shape,
+                    dtype=data_dtype,
+                    compression=compression,
+                )
+                # https://anndata.readthedocs.io/en/latest/fileformat-prose.html#dense-arrays-specification-v0-2-0
+                group.attrs["encoding-type"] = "array"
+                group.attrs["encoding-version"] = "0.2.0"
+            elif format in [
+                "csr",
+                "csc",
+            ]:  # Based on https://github.com/appier/h5sparse/blob/master/h5sparse/h5sparse.py
+                if data_dtype is None:
+                    data_dtype = np.float64
+                if matrix_shape is None:
+                    matrix_shape = (0, 0)
+                sparse_class = ss.csr_matrix if format == "csr" else ss.csc_matrix
+                data = sparse_class(matrix_shape, dtype=data_dtype)
+                self._write_elem(dest, data, compression=compression)
+            else:
+                raise NotImplementedError(
+                    f"Format must be 'dense', 'csr' or 'csc' but {format} given!"
+                )
+
 
 class RawLayer(BaseLayerMatrixAndDf):
     def __init__(self, h5_file: h5py.File):
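
`_create_new_matrix` is the private helper behind the public `create_layer`/`create_obsm`/`create_varm`/`create_obsp`/`create_varp` methods added further down in this diff. A hedged sketch of creating an empty backed CSR layer without loading anything into memory; the file and layer names are illustrative:

```python
import numpy as np
from cap_anndata import read_h5ad

with read_h5ad("your_data.h5ad", edit=True) as cap_adata:
    n_obs, n_var = cap_adata.shape
    # matrix=None plus an explicit format triggers the empty-matrix branch
    cap_adata.create_layer(
        name="scaled",  # illustrative layer name
        matrix=None,
        matrix_shape=(n_obs, n_var),
        data_dtype=np.float32,
        format="csr",
    )
```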
@@ -143,8 +217,12 @@ class CapAnnData(BaseLayerMatrixAndDf):
         self._obs: CapAnnDataDF = None
         self._var: CapAnnDataDF = None
         self._X: X_NOTATION = None
-        self._obsm: OBSM_NOTATION = None
-        self._uns: CapAnnDataUns = None
+        self._obsm: CapAnnDataDict = None
+        self._varm: CapAnnDataDict = None
+        self._layers: CapAnnDataDict = None
+        self._uns: CapAnnDataDict = None
+        self._obsp: CapAnnDataDict = None
+        self._varp: CapAnnDataDict = None
         self._raw: RawLayer = None
         self._shape: Tuple[int, int] = None
 
@@ -170,12 +248,6 @@ class CapAnnData(BaseLayerMatrixAndDf):
         self._validate_cap_df(cap_df, axis=1)
         self._var = cap_df
 
-    @property
-    def obsm(self) -> OBSM_NOTATION:
-        if self._obsm is None:
-            self._link_obsm()
-        return self._obsm
-
     @property
     def raw(self) -> RawLayer:
         if self._raw is None:
@@ -183,17 +255,51 @@ class CapAnnData(BaseLayerMatrixAndDf):
             logger.warning("Can't read raw.var since raw layer doesn't exist!")
             return
 
+        if len(self._file["raw"].keys()) == 0:
+            logger.warning("The raw layer is empty!")
+            return
+
         self._raw = RawLayer(self._file)
         return self._raw
 
     @property
-    def uns(self) -> CapAnnDataUns:
+    def uns(self) -> CapAnnDataDict[str, Any]:
         if self._uns is None:
-            self._uns = CapAnnDataUns(
+            self._uns = CapAnnDataDict(
                 {k: NotLinkedObject for k in self._file["uns"].keys()}
             )
         return self._uns
 
+    @property
+    def layers(self) -> CapAnnDataDict[str, X_NOTATION]:
+        if self._layers is None:
+            self._link_layers()
+        return self._layers
+
+    @property
+    def obsm(self) -> CapAnnDataDict[str, X_NOTATION]:
+        if self._obsm is None:
+            self._link_obsm()
+        return self._obsm
+
+    @property
+    def varm(self) -> CapAnnDataDict[str, X_NOTATION]:
+        if self._varm is None:
+            self._link_varm()
+        return self._varm
+
+    @property
+    def obsp(self) -> CapAnnDataDict[str, X_NOTATION]:
+        if self._obsp is None:
+            self._link_obsp()
+        return self._obsp
+
+    @property
+    def varp(self) -> CapAnnDataDict[str, X_NOTATION]:
+        if self._varp is None:
+            self._link_varp()
+        return self._varp
+
     def read_obs(self, columns: List[str] = None, reset: bool = False) -> None:
         df = self._read_df("obs", columns=columns)
         if self.obs.empty or reset:
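
All five mapping properties link lazily on first access and hand back backed objects, so keys and shapes are available without reading the matrices. A hedged sketch; the `X_umap` key is illustrative:

```python
from cap_anndata import read_h5ad

with read_h5ad("your_data.h5ad") as cap_adata:
    print(list(cap_adata.layers.keys()))  # linked, but nothing read into memory
    emb = cap_adata.obsm.get("X_umap")    # h5py.Dataset or sparse dataset, still backed
    if emb is not None:
        print(emb.shape)                  # metadata only
        matrix = emb[:]                   # materialize explicitly when needed
```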
@@ -210,12 +316,72 @@ class CapAnnData(BaseLayerMatrixAndDf):
         for col in df.columns:
             self._var[col] = df[col]
 
+    def read_uns(self, keys: List[str] = None) -> None:
+        if keys is None:
+            keys = list(self.uns.keys())
+
+        for key in keys:
+            existing_keys = self.uns.keys()
+            if key in existing_keys:
+                source = self._file[f"uns/{key}"]
+                self.uns[key] = read_elem(source)
+
+    def _link_layers(self) -> None:
+        if self._layers is None:
+            self._layers = CapAnnDataDict()
+        if "layers" in self._file.keys():
+            self._link_array_mapping(cap_dict=self._layers, key="layers")
+
+    def _link_obsm(self) -> None:
+        key = "obsm"
+        if self._obsm is None:
+            self._obsm = CapAnnDataDict()
+        if key in self._file.keys():
+            self._link_array_mapping(cap_dict=self._obsm, key=key)
+
+    def _link_varm(self) -> None:
+        key = "varm"
+        if self._varm is None:
+            self._varm = CapAnnDataDict()
+        if key in self._file.keys():
+            self._link_array_mapping(cap_dict=self._varm, key=key)
+
+    def _link_obsp(self):
+        key = "obsp"
+        if self._obsp is None:
+            self._obsp = CapAnnDataDict()
+
+        if key in self._file.keys():
+            self._link_array_mapping(cap_dict=self._obsp, key=key)
+
+    def _link_varp(self):
+        key = "varp"
+        if self._varp is None:
+            self._varp = CapAnnDataDict()
+
+        if key in self._file.keys():
+            self._link_array_mapping(cap_dict=self._varp, key=key)
+
+    def obsm_keys(self) -> List[str]:
+        return list(self.obsm.keys())
+
+    def obs_keys(self) -> List[str]:
+        return self.obs.column_order.tolist()
+
+    def var_keys(self) -> List[str]:
+        return self.var.column_order.tolist()
+
     def overwrite(self, fields: List[str] = None, compression: str = "lzf") -> None:
         field_to_entity = {
             "obs": self.obs,
             "var": self.var,
             "raw.var": self.raw.var if self.raw is not None else None,
             "uns": self.uns,
+            "layers": self.layers,
+            "obsm": self.obsm,
+            "varm": self.varm,
+            "obsp": self.obsp,
+            "varp": self.varp,
         }
 
         if fields is None:
@@ -237,7 +403,9 @@ class CapAnnData(BaseLayerMatrixAndDf):
             key = key.replace(".", "/") if key == "raw.var" else key
 
             for col in entity.columns:
-                self._write_elem(f"{key}/{col}", entity[col].values, compression=compression)
+                self._write_elem(
+                    f"{key}/{col}", entity[col].values, compression=compression
+                )
 
             column_order = entity.column_order
             if (
@@ -254,34 +422,179 @@ class CapAnnData(BaseLayerMatrixAndDf):
             for key in self.uns.keys_to_remove:
                 del self._file[f"uns/{key}"]
 
-    def read_uns(self, keys: List[str] = None) -> None:
-        if keys is None:
-            keys = list(self.uns.keys())
-
-        for key in keys:
-            existing_keys = self.uns.keys()
-            if key in existing_keys:
-                source = self._file[f"uns/{key}"]
-                self.uns[key] = read_elem(source)
-
-    def _link_obsm(self) -> None:
-        self._obsm = {}
-        if "obsm" in self._file.keys():
-            obsm_group = self._file["obsm"]
-            for entity_name in obsm_group.keys():
-                entity = obsm_group[entity_name]
-                if isinstance(entity, h5py.Dataset):
-                    # dense array
-                    self._obsm[entity_name] = entity
-                else:
-                    # sparse array
-                    self._obsm[entity_name] = ad.experimental.sparse_dataset(entity)
-
-    def obsm_keys(self) -> List[str]:
-        return list(self.obsm.keys())
-
-    def obs_keys(self) -> List[str]:
-        return self.obs.column_order.tolist()
-
-    def var_keys(self) -> List[str]:
-        return self.var.column_order.tolist()
+        for field in ["layers", "obsm", "varm", "obsp", "varp"]:
+            if field in fields:
+                for key in field_to_entity[field].keys_to_remove:
+                    del self._file[f"{field}/{key}"]
+
+    def create_layer(
+        self,
+        name: str,
+        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
+        matrix_shape: Union[tuple[int, int], None] = None,
+        data_dtype: Union[np.dtype, None] = None,
+        format: Union[str, None] = None,
+        compression: str = "lzf",
+    ) -> None:
+        """
+        An empty layer will be created if `matrix` is None.
+        """
+        self._create_new_matrix_in_field(
+            field="layers",
+            name=name,
+            matrix=matrix,
+            matrix_shape=matrix_shape,
+            data_dtype=data_dtype,
+            format=format,
+            compression=compression,
+        )
+        self._link_layers()
+
+    def create_obsm(
+        self,
+        name: str,
+        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
+        matrix_shape: Union[tuple[int, int], None] = None,
+        data_dtype: Union[np.dtype, None] = None,
+        format: Union[str, None] = None,
+        compression: str = "lzf",
+    ) -> None:
+        self._create_new_matrix_in_field(
+            field="obsm",
+            name=name,
+            matrix=matrix,
+            matrix_shape=matrix_shape,
+            data_dtype=data_dtype,
+            format=format,
+            compression=compression,
+        )
+        self._link_obsm()
+
+    def create_varm(
+        self,
+        name: str,
+        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
+        matrix_shape: Union[tuple[int, int], None] = None,
+        data_dtype: Union[np.dtype, None] = None,
+        format: Union[str, None] = None,
+        compression: str = "lzf",
+    ) -> None:
+        self._create_new_matrix_in_field(
+            field="varm",
+            name=name,
+            matrix=matrix,
+            matrix_shape=matrix_shape,
+            data_dtype=data_dtype,
+            format=format,
+            compression=compression,
+        )
+        self._link_varm()
+
+    def create_obsp(
+        self,
+        name: str,
+        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
+        matrix_shape: Union[tuple[int, int], None] = None,
+        data_dtype: Union[np.dtype, None] = None,
+        format: Union[str, None] = None,
+        compression: str = "lzf",
+    ) -> None:
+        self._create_new_matrix_in_field(
+            field="obsp",
+            name=name,
+            matrix=matrix,
+            matrix_shape=matrix_shape,
+            data_dtype=data_dtype,
+            format=format,
+            compression=compression,
+        )
+        self._link_obsp()
+
+    def create_varp(
+        self,
+        name: str,
+        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
+        matrix_shape: Union[tuple[int, int], None] = None,
+        data_dtype: Union[np.dtype, None] = None,
+        format: Union[str, None] = None,
+        compression: str = "lzf",
+    ) -> None:
+
+        self._create_new_matrix_in_field(
+            field="varp",
+            name=name,
+            matrix=matrix,
+            matrix_shape=matrix_shape,
+            data_dtype=data_dtype,
+            format=format,
+            compression=compression,
+        )
+        self._link_varp()
+
+    def _create_new_matrix_in_field(self, field, name, **kwargs):
+        """**kwargs: matrix, matrix_shape, data_dtype, format, compression"""
+        dest = f"{field}/{name}"
+        field_entity = getattr(self, field)
+        if name in field_entity.keys():
+            raise ValueError(
+                f"Please explicitly remove the existing '{name}' entity from {field} "
+                f"before creating a new one!"
+            )
+        if field not in self._file.keys():
+            self._file.create_group(field)
+        self._create_new_matrix(dest=dest, **kwargs)
+
+    def remove_layer(self, name: str) -> None:
+        del self._file[f"layers/{name}"]
+        self._link_layers()
+
+    def remove_obsp(self, name: str) -> None:
+        del self._file[f"obsp/{name}"]
+        self._link_obsp()
+
+    def remove_varp(self, name: str) -> None:
+        del self._file[f"varp/{name}"]
+        self._link_varp()
+
+    def remove_obsm(self, name: str) -> None:
+        del self._file[f"obsm/{name}"]
+        self._link_obsm()
+
+    def remove_varm(self, name: str) -> None:
+        del self._file[f"varm/{name}"]
+        self._link_varm()
+
+    def create_repr(self) -> str:
+        indent = " " * 4
+        s = f"CapAnnData object"
+        s += f"\n{indent}File: {self._file}"
+        s += f"\n{indent}X shape: {self.shape}"
+        s += f"\n{indent}Has raw X: {self.raw is not None}"
+        for field in ["obs", "obsm", "var", "uns", "layers"]:
+            if field in self._file:
+                in_memory = set()
+                if field in ["obs", "var", "uns"]:
+                    attr = getattr(self, field)
+                    if attr is not None:
+                        in_memory = set(attr.keys())
+                keys = list(self._file[field].keys())
+                keys = [k for k in keys if k != "_index"]
+                keys = [(k if k not in in_memory else f"{k}*") for k in keys]
+                keys_str = str(keys).replace("*'", "'*")
+                s += f"\n{indent}{field}: {keys_str}"
+        s += f"\n{indent}Note: fields marked with * are in-memory objects."
+        return s
+
+    def __repr__(self) -> str:
+        return self.create_repr()
+
+    def __str__(self) -> str:
+        return self.create_repr()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *args):
+        if self._file is not None:
+            self._file.close()
+            logger.debug("CapAnnData closed!")
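
Taken together, a hedged end-to-end sketch of the new mapping API; `X_pca` and `counts2` are illustrative names, not keys guaranteed to exist in a given file:

```python
import numpy as np
import scipy.sparse as ss
from cap_anndata import read_h5ad

with read_h5ad("your_data.h5ad", edit=True) as cap_adata:
    n_obs, n_var = cap_adata.shape
    # Written to the file immediately; no overwrite() call is needed for these
    cap_adata.create_obsm(name="X_pca", matrix=np.zeros((n_obs, 2), dtype=np.float32))
    cap_adata.create_layer(name="counts2", matrix=ss.csr_matrix((n_obs, n_var), dtype=np.float32))
    print(cap_adata)                # the new repr lists on-disk keys; '*' marks in-memory ones
    cap_adata.remove_obsm("X_pca")  # deletes obsm/X_pca from the file right away
```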
cap_anndata/reader.py CHANGED
@@ -1,6 +1,6 @@
 import logging
-import contextlib
 import h5py
+import warnings
 
 from cap_anndata import CapAnnData
 
@@ -8,7 +8,6 @@ from cap_anndata import CapAnnData
 logger = logging.getLogger(__name__)
 
 
-@contextlib.contextmanager
 def read_h5ad(file_path: str, edit: bool = False):
     """
     This is the main read method for CapAnnData.
@@ -21,24 +20,38 @@ def read_h5ad(file_path: str, edit: bool = False):
         file = h5py.File(file_path, mode)
         cap_adata = CapAnnData(file)
         logger.debug(f"Successfully read anndata file path {file_path}")
-        yield cap_adata
+        return cap_adata
 
     except Exception as error:
-        logger.error(f"Error during read anndata file at path: {file_path}, error = {error}!")
+        logger.error(
+            f"Error during read anndata file at path: {file_path}, error = {error}!"
+        )
         raise error
 
-    finally:
-        file.close()
-        logger.debug("AnnData closed!")
 
+def deprecated(message):
+    def deprecated_decorator(func):
+        def deprecated_func(*args, **kwargs):
+            warnings.warn(
+                "{} is a deprecated function. {}".format(func.__name__, message),
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+            warnings.simplefilter("default", DeprecationWarning)
+            return func(*args, **kwargs)
 
+        return deprecated_func
+
+    return deprecated_decorator
+
+
+# TODO: remove deprecated function
+@deprecated(
+    "It will be removed in the next version of the package. Please replace it with `read_h5ad`."
+)
 def read_directly(file_path: str, edit: bool = False) -> CapAnnData:
     """
     Must be used only in specific cases.
     The user is responsible for closing the h5py file when the work with the CapAnnData instance is done.
     """
-    mode = "r+" if edit else "r"
-    logger.debug(f"Read file {file_path} mode={mode} directly...")
-    file = h5py.File(file_path, mode)
-    cap_adata = CapAnnData(file)
-    return cap_adata
+    return read_h5ad(file_path, edit)
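
Because `read_h5ad` now returns the `CapAnnData` instance instead of yielding it from a `@contextlib.contextmanager` generator, the with-statement support comes from `CapAnnData.__enter__`/`__exit__` (added in `cap_anndata.py` above). Both styles below should be equivalent; the file name is illustrative:

```python
from cap_anndata import read_h5ad

# Style 1: context manager; CapAnnData.__exit__ closes the file
with read_h5ad("your_data.h5ad") as cap_adata:
    print(cap_adata.obs_keys())

# Style 2: plain handle; close explicitly when done
cap_adata = read_h5ad("your_data.h5ad")
try:
    print(cap_adata.obs_keys())
finally:
    cap_adata.file.close()  # the new `file` property exposes the underlying h5py.File
```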
cap_anndata-0.3.0.dist-info/METADATA ADDED
@@ -0,0 +1,54 @@
+Metadata-Version: 2.1
+Name: cap_anndata
+Version: 0.3.0
+Summary: Partial read/write of AnnData (h5ad) files for low-memory operations with large datasets.
+Home-page: https://github.com/cellannotation/cap-anndata
+Author: R. Mukhin, A. Isaev
+Author-email: roman@ebookapplications.com
+Project-URL: Bug Tracker, https://github.com/cellannotation/cap-anndata/issues
+Classifier: Programming Language :: Python :: 3.9
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy >=1.23.5
+Requires-Dist: pandas >=2.2.0
+Requires-Dist: anndata >=0.10.0
+Provides-Extra: dev
+Requires-Dist: pytest >=8.0.0 ; extra == 'dev'
+Requires-Dist: setuptools ~=69.1.1 ; extra == 'dev'
+
+# CAP-AnnData: Partial I/O for AnnData (.h5ad) Files
+
+## Overview
+CAP-AnnData offers functionality for selective reading and writing of [AnnData](https://pypi.org/project/anndata/)
+file fields without loading the entire dataset (or even an entire field) into memory.
+For example, it allows you to read and modify a single `obs` column while taking nothing into memory except the column itself.
+The package aims to replicate the original AnnData API as closely as possible,
+while providing additional features for efficient manipulation of heavy datasets.
+
+## Installation
+Install CAP-AnnData via pip:
+
+```commandline
+pip install -U cap-anndata
+```
+
+## Basic Example
+
+The example below shows how to read a single `obs` column, create a new obs column, and write it back to the `.h5ad` file.
+```python
+from cap_anndata import read_h5ad
+
+file_path = "your_data.h5ad"
+with read_h5ad(file_path=file_path, edit=True) as cap_adata:
+    print(cap_adata.obs_keys())  # ['a', 'b', 'c']
+    print(cap_adata.obs)  # Empty DataFrame
+    cap_adata.read_obs(columns=['a'])
+    print(cap_adata.obs.columns)  # ['a']
+    cap_adata.obs['new_col'] = cap_adata.obs['a']
+    cap_adata.overwrite(fields=['obs'])
+```
+
+More examples can be found in the [How-TO](https://github.com/cellannotation/cap-anndata/blob/main/HOWTO.md) file.
cap_anndata-0.3.0.dist-info/RECORD ADDED
@@ -0,0 +1,10 @@
+cap_anndata/__init__.py,sha256=m-iyYXl6oIgczQMXr_rqhoObblRAs37YYxMoWidm7i4,207
+cap_anndata/backed_df.py,sha256=06wZwEjszFQ8lkvy6-GgD_SD14idu9857RtlfMQiBjE,2691
+cap_anndata/backed_dict.py,sha256=jPJl7RxPxV7s5ywD23ZxkInWPrgValyKHmlKZplDuTE,1053
+cap_anndata/cap_anndata.py,sha256=RDozLa-RZoNq_-CWNbrEoLbrNfaD8GkIU8vmAkxFuoQ,21197
+cap_anndata/reader.py,sha256=yiY8kButhg5TDc_OcXNOZkJv5Bbdht3XOzswjgDogdQ,1666
+cap_anndata-0.3.0.dist-info/LICENSE,sha256=JAV0w7TBl6wQe9iFcCKjAWgpurym0f-Q0B75zm2PrKw,1560
+cap_anndata-0.3.0.dist-info/METADATA,sha256=Fj4jPwlPbFr_u-e8-cW2KX5H0bUyhiZ5wcNACGrwK9w,2172
+cap_anndata-0.3.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+cap_anndata-0.3.0.dist-info/top_level.txt,sha256=GKi_Uk4LUhXwWBfFCTIyJvEoJqFREt_4uH4CWgeLsg4,12
+cap_anndata-0.3.0.dist-info/RECORD,,
cap_anndata-0.3.0.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.42.0)
+Generator: setuptools (75.1.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
cap_anndata/backed_uns.py DELETED
@@ -1,28 +0,0 @@
-from typing import List, Any
-
-
-class CapAnnDataUns(dict):
-    __keys_to_remove: List[str] = []
-
-    def __delitem__(self, __key: Any) -> None:
-        self.__keys_to_remove.append(__key)
-        return super().__delitem__(__key)
-
-    def __setitem__(self, __key: Any, __value: Any) -> None:
-        if __key in self.__keys_to_remove:
-            self.__keys_to_remove.remove(__key)
-        return super().__setitem__(__key, __value)
-
-    @property
-    def keys_to_remove(self):
-        return self.__keys_to_remove
-
-    def pop(self, __key: Any, __default: Any = None) -> Any:
-        if __key in self:
-            self.__keys_to_remove.append(__key)
-        return super().pop(__key, __default)
-
-    def popitem(self) -> Any:
-        item = super().popitem()
-        self.__keys_to_remove.append(item[0])
-        return item
cap_anndata-0.2.2.dist-info/METADATA DELETED
@@ -1,253 +0,0 @@
-Metadata-Version: 2.1
-Name: cap_anndata
-Version: 0.2.2
-Summary: Partial read/write of AnnData (h5ad) files for low-memory operations with large datasets.
-Home-page: https://github.com/cellannotation/cap-anndata
-Author: R. Mukhin, A. Isaev
-Author-email: roman@ebookapplications.com
-Project-URL: Bug Tracker, https://github.com/cellannotation/cap-anndata/issues
-Classifier: Programming Language :: Python :: 3.9
-Classifier: License :: OSI Approved :: BSD License
-Classifier: Operating System :: OS Independent
-Requires-Python: >=3.9
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: numpy ~=1.26.3
-Requires-Dist: pandas ~=2.2.0
-Requires-Dist: anndata ~=0.10.5
-Requires-Dist: h5py ~=3.5.0
-Provides-Extra: dev
-Requires-Dist: pytest >=8.0.0 ; extra == 'dev'
-Requires-Dist: setuptools ~=69.1.1 ; extra == 'dev'
-
-# CAP-AnnData: Enhanced Partial I/O for AnnData Files
-
-## Overview
-CAP-AnnData enriches the AnnData ecosystem by offering tailored functionality for partial reading and writing of AnnData files. This enhancement allows selective manipulation of sections such as `obs`, `var`, `X`, `raw.X`, `obsm`, and `uns` without loading entire datasets into memory. Leveraging AnnData's native methods, CAP-AnnData aims to maintain backward compatibility while improving efficiency, which is especially useful for large-scale single-cell genomics data.
-
-## Getting Started
-
-### Installation
-Install CAP-AnnData via pip:
-
-```commandline
-pip install -U cap-anndata
-```
-
-### Running Tests
-Ensure the integrity and reliability of CAP-AnnData on your system by running the unit tests via `pytest` from the root of the repo.
-
-```commandline
-pip install pytest
-pytest test
-```
-
-Make sure Python 3.9 or newer is used, along with all requirements specified in requirements.txt.
-
-## How-TO:
-
-#### 1. Access AnnData File DataFrames
-
-##### Basic Reading
-By default, `CapAnnData` does not automatically read any data. To begin working with dataframes, you need to explicitly read the data from the AnnData file. You can read the entire dataframe or select specific columns. For partial reading, provide a list of column names.
-
-```python
-from cap_anndata import read_h5ad
-
-file_path = "your_data.h5ad"
-with read_h5ad(file_path=file_path, edit=False) as cap_adata:
-    # Get the list of all obs columns in the AnnData file
-    cap_adata.obs_keys()  # ['a', 'b', 'c']
-    # Read all columns of 'obs'
-    cap_adata.read_obs()
-    # Get the list of columns of the DataFrame in memory
-    cap_adata.obs.columns  # ['a', 'b', 'c']
-
-    # Get the list of all var columns in the AnnData file
-    cap_adata.var_keys()  # ['d', 'e', 'f']
-    # Read specific columns of 'var'
-    cap_adata.read_var(columns=['d'])
-    cap_adata.var.columns  # ['d']
-    # Read an additional column
-    cap_adata.read_var(columns=['e'])
-    cap_adata.var.columns  # ['d', 'e']
-
-    # Read a column and reset the in-memory DataFrame before that
-    cap_adata.read_var(columns=['f'], reset=True)
-    cap_adata.var.columns  # ['f']
-
-    # Read no columns of raw.var (only the index)
-    cap_adata.raw.read_var(columns=[])
-```
-
-##### Difference between `obs_keys()` and `obs.columns`
-`obs_keys()` returns the list of columns in the on-disk AnnData file, while `obs.columns` returns the list of columns in the in-memory DataFrame. The two lists may differ if you read only specific columns. If you modify the in-memory DataFrame, `obs_keys()` will reflect the changes. It is recommended to check `obs_keys()` before the `overwrite()` call to avoid damaging the AnnData file.
-
-If a column doesn't exist in the file, no error will be raised, but the column will be missing from the resulting DataFrame. So the list of columns means something closer to "try to read these columns from the file". This is needed because there is no way yet to check whether a column exists before the read. The behavior of `var_keys()` and `var.columns` is exactly the same.
-
-#### 2. Modify the AnnData File DataFrames In-Place
-
-You can directly modify the dataframe by adding, renaming, or removing columns.
-
-```python
-# Create a new column
-cap_adata.obs['new_col'] = [value1, value2, value3]
-
-# Rename a column
-cap_adata.obs.rename_column('old_col_name', 'new_col_name')
-
-# Remove a column
-cap_adata.obs.remove_column('col_to_remove')
-```
-
-After modifications, you can overwrite the changes back to the AnnData file. If a value doesn't exist, it will be created.
-Note: `read_h5ad` must be called with the `edit=True` argument to open the `.h5ad` file in `r+` mode.
-
-```python
-# overwrite all values which were read
-cap_adata.overwrite()
-
-# overwrite chosen fields
-cap_adata.overwrite(['obs', 'var'])
-```
-
-The full list of supported fields: `obs`, `var`, `raw.var`, `obsm`, and `uns`.
-
-#### 3. How to Read a Few Columns but Overwrite One in a Dataframe
-
-The only way to do this for now is to drop the unneeded columns from the in-memory dataframe (with `pandas.drop`!) before calling the `overwrite` method.
-
-```python
-# Read specific columns
-cap_adata.read_obs(columns=['cell_type', 'sample'])
-
-# Drop a column in-memory
-# DON'T USE remove_column here!
-cap_adata.obs.drop(columns='sample', inplace=True)
-
-# Overwrite changes
-cap_adata.overwrite(['obs'])
-
-# NOTE that the line
-# cap_adata.read_obs(columns=['sample'], reset=True)
-# would override in-memory changes with values from the AnnData file
-```
-
-#### 4. How to Work with X and raw.X
-
-The CapAnnData package won't read any field by default. However, `X` and `raw.X` will be linked to the backed matrices automatically upon the first request to those fields.
-The X object will be returned as an `h5py.Dataset` or `anndata.experimental.sparse_dataset`.
-
-```python
-with read_h5ad(file_path=file_path, edit=False) as cap_adata:
-    # self._X is None here; nothing has been read yet
-
-    # will return the h5py.Dataset or CSRDataset
-    x = cap_adata.X
-
-    # The same for raw.X
-    raw_x = cap_adata.raw.X
-
-    # take the whole matrix into memory
-    x = cap_adata.X[:]
-```
-
-CapAnnData supports the standard `numpy`/`h5py` slicing rules:
-
-```python
-# slice rows
-s_ = np.s_[0:5]
-# slice columns
-s_ = np.s_[:, 0:5]
-# boolean mask + slicing
-mask = np.array([i < 5 for i in range(cap_adata.shape[0])])
-s_ = np.s_[mask, :5]
-```
-
-#### 5. How to Handle obsm Embedding Matrices
-
-By default, CapAnnData will not read the embedding matrices.
-The link to the h5py objects will be created upon the first call of the `.obsm` property.
-As in the AnnData package, a call like `cap_adata.obsm["X_tsne"]` will not return the in-memory matrix but the backed version instead.
-It is possible to get information about the names and shapes of the embeddings without taking the whole matrix into memory.
-
-```python
-with read_h5ad(file_path=file_path, edit=False) as cap_adata:
-    # will return the list of strings
-    obsm_keys = cap_adata.obsm_keys()
-
-    # return the shape of the matrix in backed mode
-    embeddings = obsm_keys[0]
-    shape = cap_adata.obsm[embeddings].shape
-
-    # take the whole matrix into memory
-    matrix = cap_adata.obsm[embeddings][:]
-```
-
-#### 6. How to Read and Modify the uns Section
-
-The `CapAnnData` class will lazily link the uns section upon the first call but ***WILL NOT*** read it into memory. Instead, a dictionary of the pairs `{'key': "__NotLinkedObject"}` will be created. This allows getting the list of keys before the actual read. To read the uns section into memory, the `.read_uns(keys)` method must be called.
-
-```python
-with read_h5ad(file_path=file_path, edit=True) as cap_adata:
-    # will return the list of keys
-    keys = list(cap_adata.uns.keys())
-
-    # read into memory the first key only
-    cap_adata.read_uns([keys[0]])
-
-    # read the whole uns section into memory
-    cap_adata.read_uns()
-```
-
-Once the `.uns` section is in memory (partially or completely), we can work with it as with a regular Python `dict` object. The main feature of the `CapAnnDataUns` class, which inherits from `dict`, is the tracking of the keys which must be removed from the `.h5ad` file upon overwrite.
-
-```python
-# get the value
-v = cap_adata.uns["key1"]
-v = cap_adata.uns.get("key1")
-
-# modify values
-cap_adata.uns["key1"] = "new_value"
-
-# create new keys
-cap_adata.uns["new_key"] = "value"
-
-# remove keys
-cap_adata.uns.pop("key1")  # the recommended way
-del cap_adata.uns["key2"]
-cap_adata.uns.popitem()
-```
-
-To save `uns` changes, the method `CapAnnData.overwrite()` must be called.
-
-```python
-cap_adata.overwrite()  # all in-memory fields will be overwritten
-cap_adata.overwrite(["uns"])  # overwrite the uns section only
-```
-
-#### 7. Join and Merge DataFrames
-
-CAP-AnnData provides enhanced methods for joining and merging dataframes, preserving column order and data integrity.
-
-```python
-from cap_anndata import CapAnnDataDF
-import pandas as pd
-
-data1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
-data2 = pd.DataFrame({'D': [7, 8, 9], 'E': [10, 11, 12]})
-cap_anndata_df1 = CapAnnDataDF.from_df(data1, column_order=['A', 'B', 'C'])
-
-cap_df = cap_anndata_df1.join(data2, how='left')
-
-cap_df.columns  # ['A', 'B', 'D', 'E']
-cap_df.column_order  # ['A', 'B', 'C', 'D', 'E']
-
-data3 = pd.DataFrame({'A': [2, 3, 4], 'D': [10, 11, 12]})
-cap_df = cap_anndata_df1.merge(data3, on='A')
-
-cap_df.columns  # ['A', 'B', 'D']
-cap_df.column_order  # ['A', 'B', 'C', 'D']
-cap_df.shape  # (2, 3)
-```
cap_anndata-0.2.2.dist-info/RECORD DELETED
@@ -1,10 +0,0 @@
-cap_anndata/__init__.py,sha256=l9lvFpcMsQksp8_dI-fjUgrImoMdztbu3jVSdmxNPmA,205
-cap_anndata/backed_df.py,sha256=06wZwEjszFQ8lkvy6-GgD_SD14idu9857RtlfMQiBjE,2691
-cap_anndata/backed_uns.py,sha256=Tfxoz3RgcgENf4SvxFOox9w048K2QmBTh1VbAf4yqVI,854
-cap_anndata/cap_anndata.py,sha256=fEaIwWIKKDJpIsQ7cwOfUTmUReIyryv5qRDqRjRsWhU,10185
-cap_anndata/reader.py,sha256=kg9xoS_S0gY6WpsHE8PwGMa14VXh9Ibqjw4bwoerYsE,1267
-cap_anndata-0.2.2.dist-info/LICENSE,sha256=JAV0w7TBl6wQe9iFcCKjAWgpurym0f-Q0B75zm2PrKw,1560
-cap_anndata-0.2.2.dist-info/METADATA,sha256=h41dgoz3w2rDHnic828FahjEoKq1lt_Bi1jm-ZX-goA,9569
-cap_anndata-0.2.2.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-cap_anndata-0.2.2.dist-info/top_level.txt,sha256=GKi_Uk4LUhXwWBfFCTIyJvEoJqFREt_4uH4CWgeLsg4,12
-cap_anndata-0.2.2.dist-info/RECORD,,