cap-anndata 0.3.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cap_anndata/backed_df.py CHANGED
@@ -1,13 +1,10 @@
1
1
  import pandas as pd
2
2
  import numpy as np
3
3
  from typing import List, Any, Union
4
- import logging
5
4
 
6
5
  from pandas._typing import Self
7
6
  from pandas.core.generic import bool_t
8
7
 
9
- logger = logging.getLogger(__name__)
10
-
11
8
 
12
9
  class CapAnnDataDF(pd.DataFrame):
13
10
  """
@@ -19,26 +16,37 @@ class CapAnnDataDF(pd.DataFrame):
19
16
 
20
17
  _metadata = ["column_order"]
21
18
 
19
+ def column_order_array(self) -> np.array:
20
+ order = self.column_order
21
+ if order is not None and isinstance(order, List):
22
+ # Convert it to numpy array of str elements
23
+ return np.array(order, dtype=object)
24
+ else:
25
+ return order
26
+
22
27
  def rename_column(self, old_name: str, new_name: str) -> None:
23
- i = np.where(self.column_order == old_name)[0]
24
- self.column_order[i] = new_name
28
+ i = np.where(self.column_order_array() == old_name)[0]
29
+ tmp_array = self.column_order_array().copy()
30
+ tmp_array[i] = new_name
31
+ self.column_order = tmp_array.copy()
25
32
  self.rename(columns={old_name: new_name}, inplace=True)
26
33
 
27
34
  def remove_column(self, col_name: str) -> None:
28
- i = np.where(self.column_order == col_name)[0]
29
- self.column_order = np.delete(self.column_order, i)
35
+ i = np.where(self.column_order_array() == col_name)[0]
36
+ self.column_order = np.delete(self.column_order_array(), i)
30
37
  self.drop(columns=[col_name], inplace=True)
31
38
 
32
39
  def __setitem__(self, key, value) -> None:
33
- if key not in self.column_order:
34
- self.column_order = np.append(self.column_order, key)
40
+ if key not in self.column_order_array():
41
+ self.column_order = np.append(self.column_order_array(), key)
35
42
  return super().__setitem__(key, value)
36
43
 
37
44
  @classmethod
38
- def from_df(cls, df: pd.DataFrame, column_order: List[str] = None) -> Self:
45
+ def from_df(cls, df: pd.DataFrame, column_order: Union[np.array, List[str], None] = None) -> Self:
39
46
  if column_order is None:
40
47
  column_order = df.columns.to_numpy()
41
-
48
+ elif isinstance(column_order, List):
49
+ column_order = np.array(column_order)
42
50
  new_inst = cls(df)
43
51
  new_inst.column_order = column_order
44
52
  return new_inst
@@ -47,23 +55,27 @@ class CapAnnDataDF(pd.DataFrame):
47
55
  result = super().join(other=other, **kwargs)
48
56
  if isinstance(other, CapAnnDataDF):
49
57
  new_columns = [
50
- col for col in other.column_order if col not in self.column_order
58
+ col for col in other.column_order_array() if col not in self.column_order_array()
51
59
  ]
52
60
  else:
53
- new_columns = [col for col in other.columns if col not in self.column_order]
54
- column_order = np.append(self.column_order, new_columns)
55
- return self.from_df(result, column_order=column_order)
61
+ new_columns = [col for col in other.columns if col not in self.column_order_array()]
62
+ column_order = np.append(self.column_order_array(), new_columns)
63
+ df = self.from_df(result, column_order=column_order)
64
+ return df
56
65
 
57
66
  def merge(self, right, **kwargs) -> Self:
58
67
  result = super().merge(right=right, **kwargs)
59
68
  if isinstance(right, CapAnnDataDF):
60
69
  new_columns = [
61
- col for col in right.column_order if col not in self.column_order
70
+ col for col in right.column_order_array() if col not in self.column_order_array()
62
71
  ]
63
72
  else:
64
- new_columns = [col for col in right.columns if col not in self.column_order]
65
- column_order = np.append(self.column_order, new_columns)
66
- return self.from_df(result, column_order=column_order)
73
+ new_columns = [col for col in right.columns if col not in self.column_order_array()]
74
+ column_order = np.append(self.column_order_array(), new_columns)
75
+ df = self.from_df(result, column_order=column_order)
76
+ return df
67
77
 
68
78
  def copy(self, deep: Union[bool_t, None] = True) -> Self:
69
- return self.from_df(super().copy(deep=deep), column_order=self.column_order)
79
+ column_order = self.column_order_array()
80
+ df = self.from_df(super().copy(deep=deep), column_order=column_order)
81
+ return df
@@ -7,19 +7,33 @@ import scipy.sparse as ss
7
7
  from packaging import version
8
8
 
9
9
  if version.parse(ad.__version__) < version.parse("0.11.0"):
10
- from anndata.experimental import sparse_dataset, read_elem, write_elem
10
+ from anndata.experimental import (
11
+ sparse_dataset,
12
+ read_elem,
13
+ write_elem,
14
+ CSRDataset,
15
+ CSCDataset,
16
+ )
11
17
  else:
12
- from anndata.io import sparse_dataset, read_elem, write_elem
18
+ from anndata.io import (
19
+ sparse_dataset,
20
+ read_elem,
21
+ write_elem,
22
+ )
23
+ from anndata.abc import (
24
+ CSRDataset,
25
+ CSCDataset,
26
+ )
13
27
 
14
28
  from cap_anndata import CapAnnDataDF, CapAnnDataDict
15
29
 
16
30
  logger = logging.getLogger(__name__)
17
31
 
18
32
  X_NOTATION = Union[
19
- h5py.Dataset, ad.experimental.CSRDataset, ad.experimental.CSCDataset, None
33
+ h5py.Dataset, CSRDataset, CSCDataset, None
20
34
  ]
21
35
  ARRAY_MAPPING_NOTATION = CapAnnDataDict[str, X_NOTATION]
22
-
36
+ FIELDS_SUPPORTED_TO_OVERWRITE = ["obs", "var", "raw.var", "uns", "layers", "obsm", "varm", "obsp", "varp"]
23
37
  NotLinkedObject: Final = "__NotLinkedObject"
24
38
 
25
39
 
@@ -57,15 +71,7 @@ class BaseLayerMatrixAndDf:
57
71
  return shape
58
72
 
59
73
  def _lazy_df_load(self, key: str) -> CapAnnDataDF:
60
- df = CapAnnDataDF()
61
- attribute = self._path_to_content + key
62
- column_order = self._read_attr(self._file[attribute], "column-order")
63
- df.column_order = column_order
64
- if df.column_order.dtype != object:
65
- # empty DataFrame will have column_order as float64
66
- # which leads to failure in overwrite method
67
- df.column_order = df.column_order.astype(object)
68
- return df
74
+ return self._read_df(key=key, columns=[])
69
75
 
70
76
  @staticmethod
71
77
  def _read_attr(obj: Union[h5py.Group, h5py.Dataset], attr_name: str) -> any:
@@ -93,8 +99,10 @@ class BaseLayerMatrixAndDf:
93
99
  cols_to_read = [c for c in columns if c in column_order]
94
100
  df = CapAnnDataDF()
95
101
  df.column_order = column_order
102
+
96
103
  index_col = self._read_attr(h5_group, "_index")
97
- df.index = read_elem(h5_group[index_col])
104
+ index = read_elem(h5_group[index_col])
105
+ df.index = index
98
106
 
99
107
  for col in cols_to_read:
100
108
  df[col] = read_elem(h5_group[col])
@@ -135,15 +143,19 @@ class BaseLayerMatrixAndDf:
135
143
  if not isinstance(group, h5py.Group):
136
144
  raise ValueError(f"The object {key} must be a group!")
137
145
 
138
- for array_name in group.keys():
139
- array = group[array_name]
140
- if isinstance(array, h5py.Dataset):
141
- cap_dict[array_name] = array
142
- elif isinstance(array, h5py.Group):
143
- cap_dict[array_name] = sparse_dataset(array)
146
+ for entity_name in group.keys():
147
+ entity = group[entity_name]
148
+ if isinstance(entity, h5py.Dataset):
149
+ cap_dict[entity_name] = entity
150
+ elif isinstance(entity, h5py.Group):
151
+ enc_type = dict(entity.attrs).get("encoding-type")
152
+ if enc_type == "dataframe":
153
+ cap_dict[entity_name] = self._read_df(key="/".join([key, entity_name]), columns=None)
154
+ elif enc_type in ["csc_matrix", "csr_matrix"]:
155
+ cap_dict[entity_name] = sparse_dataset(entity)
144
156
  else:
145
157
  raise ValueError(
146
- f"Can't link array in {key} due to unsupported type of object: {type(array)}"
158
+ f"Can't link array in {key} due to unsupported type of object: {type(entity)}"
147
159
  )
148
160
 
149
161
  def _create_new_matrix(
@@ -252,11 +264,11 @@ class CapAnnData(BaseLayerMatrixAndDf):
252
264
  def raw(self) -> RawLayer:
253
265
  if self._raw is None:
254
266
  if "raw" not in self._file.keys():
255
- logger.warning("Can't read raw.var since raw layer doesn't exist!")
267
+ logger.debug("Can't read raw.var since raw layer doesn't exist!")
256
268
  return
257
269
 
258
270
  if len(self._file["raw"].keys()) == 0:
259
- logger.warning("The raw layer is empty!")
271
+ logger.debug("The raw layer is empty!")
260
272
  return
261
273
 
262
274
  self._raw = RawLayer(self._file)
@@ -366,37 +378,43 @@ class CapAnnData(BaseLayerMatrixAndDf):
366
378
  return list(self.obsm.keys())
367
379
 
368
380
  def obs_keys(self) -> List[str]:
369
- return self.obs.column_order.tolist()
381
+ return self.obs.column_order_array().tolist()
370
382
 
371
383
  def var_keys(self) -> List[str]:
372
- return self.var.column_order.tolist()
384
+ return self.var.column_order_array().tolist()
385
+
386
+ def field_to_entity(self, key):
387
+ if key == "obs":
388
+ return self.obs
389
+ elif key == "var":
390
+ return self.var
391
+ elif key == "raw.var":
392
+ return self.raw.var if self.raw is not None else None
393
+ elif key == "uns":
394
+ return self.uns
395
+ elif key == "layers":
396
+ return self.layers
397
+ elif key == "obsm":
398
+ return self.obsm
399
+ elif key == "varm":
400
+ return self.varm
401
+ elif key == "obsp":
402
+ return self.obsp
403
+ elif key == "varp":
404
+ return self.varp
405
+ else:
406
+ raise KeyError(
407
+ f"The field {key} is not supported! The list of supported fields are equal to {FIELDS_SUPPORTED_TO_OVERWRITE} "
408
+ f"attributes of the CapAnnData class."
409
+ )
373
410
 
374
411
  def overwrite(self, fields: List[str] = None, compression: str = "lzf") -> None:
375
- field_to_entity = {
376
- "obs": self.obs,
377
- "var": self.var,
378
- "raw.var": self.raw.var if self.raw is not None else None,
379
- "uns": self.uns,
380
- "layers": self.layers,
381
- "obsm": self.obsm,
382
- "varm": self.varm,
383
- "obsp": self.obsp,
384
- "varp": self.varp,
385
- }
386
-
387
412
  if fields is None:
388
- fields = list(field_to_entity.keys())
389
- else:
390
- for f in fields:
391
- if f not in field_to_entity.keys():
392
- raise KeyError(
393
- f"The field {f} is not supported! The list of supported fields are equal to supported "
394
- f"attributes of the CapAnnData class: obs, var, raw.var and uns."
395
- )
413
+ fields = FIELDS_SUPPORTED_TO_OVERWRITE
396
414
 
397
415
  for key in ["obs", "var", "raw.var"]:
398
416
  if key in fields:
399
- entity: CapAnnDataDF = field_to_entity[key]
417
+ entity: CapAnnDataDF = self.field_to_entity(key)
400
418
  if entity is None:
401
419
  continue
402
420
 
@@ -407,11 +425,22 @@ class CapAnnData(BaseLayerMatrixAndDf):
407
425
  f"{key}/{col}", entity[col].values, compression=compression
408
426
  )
409
427
 
410
- column_order = entity.column_order
428
+ column_order = entity.column_order_array()
411
429
  if (
412
430
  column_order.size == 0
413
431
  ): # Refs https://github.com/cellannotation/cap-anndata/issues/6
414
432
  column_order = np.array([], dtype=np.float64)
433
+
434
+ # Index update
435
+ index_name = entity.index.name
436
+ if not index_name:
437
+ index_name = "_index"
438
+ self._file[key].attrs["_index"] = index_name
439
+ index_col = self._read_attr(self._file[key], "_index")
440
+ self._write_elem(
441
+ f"{key}/{index_col}", entity.index.to_numpy(), compression=compression
442
+ )
443
+
415
444
  self._file[key].attrs["column-order"] = column_order
416
445
 
417
446
  if "uns" in fields:
@@ -424,7 +453,7 @@ class CapAnnData(BaseLayerMatrixAndDf):
424
453
 
425
454
  for field in ["layers", "obsm", "varm", "obsp", "varp"]:
426
455
  if field in fields:
427
- for key in field_to_entity[field].keys_to_remove:
456
+ for key in self.field_to_entity(field).keys_to_remove:
428
457
  del self._file[f"{field}/{key}"]
429
458
 
430
459
  def create_layer(
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: cap_anndata
3
- Version: 0.3.1
3
+ Version: 0.5.0
4
4
  Summary: Partial read/write of AnnData (h5ad) files for low-memory operations with large datasets.
5
5
  Home-page: https://github.com/cellannotation/cap-anndata
6
6
  Author: R. Mukhin, A. Isaev
@@ -14,15 +14,29 @@ Classifier: Operating System :: OS Independent
14
14
  Requires-Python: >=3.9
15
15
  Description-Content-Type: text/markdown
16
16
  License-File: LICENSE
17
- Requires-Dist: numpy >=1.23.5
18
- Requires-Dist: pandas >=2.2.0
19
- Requires-Dist: anndata >=0.10.0
17
+ Requires-Dist: numpy>=1.23.5
18
+ Requires-Dist: pandas>=2.2.0
19
+ Requires-Dist: anndata>=0.10.0
20
20
  Provides-Extra: dev
21
- Requires-Dist: pytest >=8.0.0 ; extra == 'dev'
22
- Requires-Dist: setuptools ~=69.1.1 ; extra == 'dev'
21
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
22
+ Requires-Dist: setuptools~=69.1.1; extra == "dev"
23
+ Dynamic: author
24
+ Dynamic: author-email
25
+ Dynamic: classifier
26
+ Dynamic: description
27
+ Dynamic: description-content-type
28
+ Dynamic: home-page
29
+ Dynamic: license-file
30
+ Dynamic: project-url
31
+ Dynamic: provides-extra
32
+ Dynamic: requires-dist
33
+ Dynamic: requires-python
34
+ Dynamic: summary
23
35
 
24
36
  # CAP-AnnData: Partial I/O for AnnData (.h5ad) Files
25
37
 
38
+ [![PyPI version](https://img.shields.io/pypi/v/cap-anndata)](https://pypi.org/project/cap-anndata/) [![Build Status](https://github.com/cellannotation/cap-anndata/actions/workflows/python-app.yml/badge.svg)](https://github.com/cellannotation/cap-anndata/actions)
39
+
26
40
  ## Overview
27
41
  CAP-AnnData offering functionalities for selective reading and writing of [AnnData](https://pypi.org/project/anndata/)
28
42
  file fields without the need for loading entire dataset (or even entire field) into memory.
@@ -0,0 +1,10 @@
1
+ cap_anndata/__init__.py,sha256=WRAQEDsWTvLbJWVUA5FmKCVrD2GN4oRd5I3c8jc9ajo,197
2
+ cap_anndata/backed_df.py,sha256=2OVomvTY51V05sYwEXg-4JYBgd9iJCA2-Lt7nEAL1Ug,3255
3
+ cap_anndata/backed_dict.py,sha256=Hb1SjnKuQ13mBUitQ5sL3kmcQ1j3GgB19r3yXkC0oIo,1019
4
+ cap_anndata/cap_anndata.py,sha256=4sro4BIsaOuTBHrRXYCi0WlGtxsql_bnqIDEpT2tRhQ,21371
5
+ cap_anndata/reader.py,sha256=UpZBCjaS4-K2w_9m6IuYetO9LwmEEJ5KvAw9aAoMRno,1609
6
+ cap_anndata-0.5.0.dist-info/licenses/LICENSE,sha256=XXTH6JikkxH7Gqy9VEj4crSizuwxzv04ROzkQ-ZS6o4,1532
7
+ cap_anndata-0.5.0.dist-info/METADATA,sha256=CbdJemeEOB1hIJ7tPrVOT7JldkVNOiJ6zkW8AFqTjqU,2825
8
+ cap_anndata-0.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
+ cap_anndata-0.5.0.dist-info/top_level.txt,sha256=GKi_Uk4LUhXwWBfFCTIyJvEoJqFREt_4uH4CWgeLsg4,12
10
+ cap_anndata-0.5.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.3.0)
2
+ Generator: setuptools (80.9.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,10 +0,0 @@
1
- cap_anndata/__init__.py,sha256=WRAQEDsWTvLbJWVUA5FmKCVrD2GN4oRd5I3c8jc9ajo,197
2
- cap_anndata/backed_df.py,sha256=bMNsArbPjA-TN7eQB4-9Y2l3s8o03-dM4hPnOR9tROc,2622
3
- cap_anndata/backed_dict.py,sha256=Hb1SjnKuQ13mBUitQ5sL3kmcQ1j3GgB19r3yXkC0oIo,1019
4
- cap_anndata/cap_anndata.py,sha256=uQh49Kwu2cE4-ebgOvb78mMGA_afkZcsr71j6f8EX2I,20600
5
- cap_anndata/reader.py,sha256=UpZBCjaS4-K2w_9m6IuYetO9LwmEEJ5KvAw9aAoMRno,1609
6
- cap_anndata-0.3.1.dist-info/LICENSE,sha256=XXTH6JikkxH7Gqy9VEj4crSizuwxzv04ROzkQ-ZS6o4,1532
7
- cap_anndata-0.3.1.dist-info/METADATA,sha256=688YuF45IuOvu1Hqxbt_O1aeYkoMX4tjV0b2hb1WY8I,2304
8
- cap_anndata-0.3.1.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
9
- cap_anndata-0.3.1.dist-info/top_level.txt,sha256=GKi_Uk4LUhXwWBfFCTIyJvEoJqFREt_4uH4CWgeLsg4,12
10
- cap_anndata-0.3.1.dist-info/RECORD,,