cap-anndata 0.3.1__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: cap_anndata
3
- Version: 0.3.1
3
+ Version: 0.5.0
4
4
  Summary: Partial read/write of AnnData (h5ad) files for low-memory operations with large datasets.
5
5
  Home-page: https://github.com/cellannotation/cap-anndata
6
6
  Author: R. Mukhin, A. Isaev
@@ -20,9 +20,23 @@ Requires-Dist: anndata>=0.10.0
20
20
  Provides-Extra: dev
21
21
  Requires-Dist: pytest>=8.0.0; extra == "dev"
22
22
  Requires-Dist: setuptools~=69.1.1; extra == "dev"
23
+ Dynamic: author
24
+ Dynamic: author-email
25
+ Dynamic: classifier
26
+ Dynamic: description
27
+ Dynamic: description-content-type
28
+ Dynamic: home-page
29
+ Dynamic: license-file
30
+ Dynamic: project-url
31
+ Dynamic: provides-extra
32
+ Dynamic: requires-dist
33
+ Dynamic: requires-python
34
+ Dynamic: summary
23
35
 
24
36
  # CAP-AnnData: Partial I/O for AnnData (.h5ad) Files
25
37
 
38
+ [![PyPI version](https://img.shields.io/pypi/v/cap-anndata)](https://pypi.org/project/cap-anndata/) [![Build Status](https://github.com/cellannotation/cap-anndata/actions/workflows/python-app.yml/badge.svg)](https://github.com/cellannotation/cap-anndata/actions)
39
+
26
40
  ## Overview
27
41
  CAP-AnnData offers functionality for selective reading and writing of [AnnData](https://pypi.org/project/anndata/)
28
42
  file fields without the need to load the entire dataset (or even an entire field) into memory.
@@ -1,5 +1,7 @@
1
1
  # CAP-AnnData: Partial I/O for AnnData (.h5ad) Files
2
2
 
3
+ [![PyPI version](https://img.shields.io/pypi/v/cap-anndata)](https://pypi.org/project/cap-anndata/) [![Build Status](https://github.com/cellannotation/cap-anndata/actions/workflows/python-app.yml/badge.svg)](https://github.com/cellannotation/cap-anndata/actions)
4
+
3
5
  ## Overview
4
6
  CAP-AnnData offers functionality for selective reading and writing of [AnnData](https://pypi.org/project/anndata/)
5
7
  file fields without the need to load the entire dataset (or even an entire field) into memory.
@@ -0,0 +1,81 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from typing import List, Any, Union
4
+
5
+ from pandas._typing import Self
6
+ from pandas.core.generic import bool_t
7
+
8
+
9
+ class CapAnnDataDF(pd.DataFrame):
10
+ """
11
+ The class to expand the pandas DataFrame behaviour to support partial
12
+ reading and writing of AnnData obs and var (raw.var) fields.
13
+ The main feature of the class is handling <column-order> attribute
14
+ which must be a copy of h5py.Group attribute
15
+ """
16
+
17
+ _metadata = ["column_order"]
18
+
19
+ def column_order_array(self) -> np.array:
20
+ order = self.column_order
21
+ if order is not None and isinstance(order, List):
22
+ # Convert it to numpy array of str elements
23
+ return np.array(order, dtype=object)
24
+ else:
25
+ return order
26
+
27
+ def rename_column(self, old_name: str, new_name: str) -> None:
28
+ i = np.where(self.column_order_array() == old_name)[0]
29
+ tmp_array = self.column_order_array().copy()
30
+ tmp_array[i] = new_name
31
+ self.column_order = tmp_array.copy()
32
+ self.rename(columns={old_name: new_name}, inplace=True)
33
+
34
+ def remove_column(self, col_name: str) -> None:
35
+ i = np.where(self.column_order_array() == col_name)[0]
36
+ self.column_order = np.delete(self.column_order_array(), i)
37
+ self.drop(columns=[col_name], inplace=True)
38
+
39
+ def __setitem__(self, key, value) -> None:
40
+ if key not in self.column_order_array():
41
+ self.column_order = np.append(self.column_order_array(), key)
42
+ return super().__setitem__(key, value)
43
+
44
+ @classmethod
45
+ def from_df(cls, df: pd.DataFrame, column_order: Union[np.array, List[str], None] = None) -> Self:
46
+ if column_order is None:
47
+ column_order = df.columns.to_numpy()
48
+ elif isinstance(column_order, List):
49
+ column_order = np.array(column_order)
50
+ new_inst = cls(df)
51
+ new_inst.column_order = column_order
52
+ return new_inst
53
+
54
+ def join(self, other: Any, **kwargs) -> Self:
55
+ result = super().join(other=other, **kwargs)
56
+ if isinstance(other, CapAnnDataDF):
57
+ new_columns = [
58
+ col for col in other.column_order_array() if col not in self.column_order_array()
59
+ ]
60
+ else:
61
+ new_columns = [col for col in other.columns if col not in self.column_order_array()]
62
+ column_order = np.append(self.column_order_array(), new_columns)
63
+ df = self.from_df(result, column_order=column_order)
64
+ return df
65
+
66
+ def merge(self, right, **kwargs) -> Self:
67
+ result = super().merge(right=right, **kwargs)
68
+ if isinstance(right, CapAnnDataDF):
69
+ new_columns = [
70
+ col for col in right.column_order_array() if col not in self.column_order_array()
71
+ ]
72
+ else:
73
+ new_columns = [col for col in right.columns if col not in self.column_order_array()]
74
+ column_order = np.append(self.column_order_array(), new_columns)
75
+ df = self.from_df(result, column_order=column_order)
76
+ return df
77
+
78
+ def copy(self, deep: Union[bool_t, None] = True) -> Self:
79
+ column_order = self.column_order_array()
80
+ df = self.from_df(super().copy(deep=deep), column_order=column_order)
81
+ return df
@@ -7,19 +7,33 @@ import scipy.sparse as ss
7
7
  from packaging import version
8
8
 
9
9
  if version.parse(ad.__version__) < version.parse("0.11.0"):
10
- from anndata.experimental import sparse_dataset, read_elem, write_elem
10
+ from anndata.experimental import (
11
+ sparse_dataset,
12
+ read_elem,
13
+ write_elem,
14
+ CSRDataset,
15
+ CSCDataset,
16
+ )
11
17
  else:
12
- from anndata.io import sparse_dataset, read_elem, write_elem
18
+ from anndata.io import (
19
+ sparse_dataset,
20
+ read_elem,
21
+ write_elem,
22
+ )
23
+ from anndata.abc import (
24
+ CSRDataset,
25
+ CSCDataset,
26
+ )
13
27
 
14
28
  from cap_anndata import CapAnnDataDF, CapAnnDataDict
15
29
 
16
30
  logger = logging.getLogger(__name__)
17
31
 
18
32
  X_NOTATION = Union[
19
- h5py.Dataset, ad.experimental.CSRDataset, ad.experimental.CSCDataset, None
33
+ h5py.Dataset, CSRDataset, CSCDataset, None
20
34
  ]
21
35
  ARRAY_MAPPING_NOTATION = CapAnnDataDict[str, X_NOTATION]
22
-
36
+ FIELDS_SUPPORTED_TO_OVERWRITE = ["obs", "var", "raw.var", "uns", "layers", "obsm", "varm", "obsp", "varp"]
23
37
  NotLinkedObject: Final = "__NotLinkedObject"
24
38
 
25
39
 
@@ -57,15 +71,7 @@ class BaseLayerMatrixAndDf:
57
71
  return shape
58
72
 
59
73
  def _lazy_df_load(self, key: str) -> CapAnnDataDF:
60
- df = CapAnnDataDF()
61
- attribute = self._path_to_content + key
62
- column_order = self._read_attr(self._file[attribute], "column-order")
63
- df.column_order = column_order
64
- if df.column_order.dtype != object:
65
- # empty DataFrame will have column_order as float64
66
- # which leads to failure in overwrite method
67
- df.column_order = df.column_order.astype(object)
68
- return df
74
+ return self._read_df(key=key, columns=[])
69
75
 
70
76
  @staticmethod
71
77
  def _read_attr(obj: Union[h5py.Group, h5py.Dataset], attr_name: str) -> any:
@@ -93,8 +99,10 @@ class BaseLayerMatrixAndDf:
93
99
  cols_to_read = [c for c in columns if c in column_order]
94
100
  df = CapAnnDataDF()
95
101
  df.column_order = column_order
102
+
96
103
  index_col = self._read_attr(h5_group, "_index")
97
- df.index = read_elem(h5_group[index_col])
104
+ index = read_elem(h5_group[index_col])
105
+ df.index = index
98
106
 
99
107
  for col in cols_to_read:
100
108
  df[col] = read_elem(h5_group[col])
@@ -135,15 +143,19 @@ class BaseLayerMatrixAndDf:
135
143
  if not isinstance(group, h5py.Group):
136
144
  raise ValueError(f"The object {key} must be a group!")
137
145
 
138
- for array_name in group.keys():
139
- array = group[array_name]
140
- if isinstance(array, h5py.Dataset):
141
- cap_dict[array_name] = array
142
- elif isinstance(array, h5py.Group):
143
- cap_dict[array_name] = sparse_dataset(array)
146
+ for entity_name in group.keys():
147
+ entity = group[entity_name]
148
+ if isinstance(entity, h5py.Dataset):
149
+ cap_dict[entity_name] = entity
150
+ elif isinstance(entity, h5py.Group):
151
+ enc_type = dict(entity.attrs).get("encoding-type")
152
+ if enc_type == "dataframe":
153
+ cap_dict[entity_name] = self._read_df(key="/".join([key, entity_name]), columns=None)
154
+ elif enc_type in ["csc_matrix", "csr_matrix"]:
155
+ cap_dict[entity_name] = sparse_dataset(entity)
144
156
  else:
145
157
  raise ValueError(
146
- f"Can't link array in {key} due to unsupported type of object: {type(array)}"
158
+ f"Can't link array in {key} due to unsupported type of object: {type(entity)}"
147
159
  )
148
160
 
149
161
  def _create_new_matrix(
@@ -252,11 +264,11 @@ class CapAnnData(BaseLayerMatrixAndDf):
252
264
  def raw(self) -> RawLayer:
253
265
  if self._raw is None:
254
266
  if "raw" not in self._file.keys():
255
- logger.warning("Can't read raw.var since raw layer doesn't exist!")
267
+ logger.debug("Can't read raw.var since raw layer doesn't exist!")
256
268
  return
257
269
 
258
270
  if len(self._file["raw"].keys()) == 0:
259
- logger.warning("The raw layer is empty!")
271
+ logger.debug("The raw layer is empty!")
260
272
  return
261
273
 
262
274
  self._raw = RawLayer(self._file)
@@ -366,37 +378,43 @@ class CapAnnData(BaseLayerMatrixAndDf):
366
378
  return list(self.obsm.keys())
367
379
 
368
380
  def obs_keys(self) -> List[str]:
369
- return self.obs.column_order.tolist()
381
+ return self.obs.column_order_array().tolist()
370
382
 
371
383
  def var_keys(self) -> List[str]:
372
- return self.var.column_order.tolist()
384
+ return self.var.column_order_array().tolist()
385
+
386
+ def field_to_entity(self, key):
387
+ if key == "obs":
388
+ return self.obs
389
+ elif key == "var":
390
+ return self.var
391
+ elif key == "raw.var":
392
+ return self.raw.var if self.raw is not None else None
393
+ elif key == "uns":
394
+ return self.uns
395
+ elif key == "layers":
396
+ return self.layers
397
+ elif key == "obsm":
398
+ return self.obsm
399
+ elif key == "varm":
400
+ return self.varm
401
+ elif key == "obsp":
402
+ return self.obsp
403
+ elif key == "varp":
404
+ return self.varp
405
+ else:
406
+ raise KeyError(
407
+ f"The field {key} is not supported! The list of supported fields are equal to {FIELDS_SUPPORTED_TO_OVERWRITE} "
408
+ f"attributes of the CapAnnData class."
409
+ )
373
410
 
374
411
  def overwrite(self, fields: List[str] = None, compression: str = "lzf") -> None:
375
- field_to_entity = {
376
- "obs": self.obs,
377
- "var": self.var,
378
- "raw.var": self.raw.var if self.raw is not None else None,
379
- "uns": self.uns,
380
- "layers": self.layers,
381
- "obsm": self.obsm,
382
- "varm": self.varm,
383
- "obsp": self.obsp,
384
- "varp": self.varp,
385
- }
386
-
387
412
  if fields is None:
388
- fields = list(field_to_entity.keys())
389
- else:
390
- for f in fields:
391
- if f not in field_to_entity.keys():
392
- raise KeyError(
393
- f"The field {f} is not supported! The list of supported fields are equal to supported "
394
- f"attributes of the CapAnnData class: obs, var, raw.var and uns."
395
- )
413
+ fields = FIELDS_SUPPORTED_TO_OVERWRITE
396
414
 
397
415
  for key in ["obs", "var", "raw.var"]:
398
416
  if key in fields:
399
- entity: CapAnnDataDF = field_to_entity[key]
417
+ entity: CapAnnDataDF = self.field_to_entity(key)
400
418
  if entity is None:
401
419
  continue
402
420
 
@@ -407,11 +425,22 @@ class CapAnnData(BaseLayerMatrixAndDf):
407
425
  f"{key}/{col}", entity[col].values, compression=compression
408
426
  )
409
427
 
410
- column_order = entity.column_order
428
+ column_order = entity.column_order_array()
411
429
  if (
412
430
  column_order.size == 0
413
431
  ): # Refs https://github.com/cellannotation/cap-anndata/issues/6
414
432
  column_order = np.array([], dtype=np.float64)
433
+
434
+ # Index update
435
+ index_name = entity.index.name
436
+ if not index_name:
437
+ index_name = "_index"
438
+ self._file[key].attrs["_index"] = index_name
439
+ index_col = self._read_attr(self._file[key], "_index")
440
+ self._write_elem(
441
+ f"{key}/{index_col}", entity.index.to_numpy(), compression=compression
442
+ )
443
+
415
444
  self._file[key].attrs["column-order"] = column_order
416
445
 
417
446
  if "uns" in fields:
@@ -424,7 +453,7 @@ class CapAnnData(BaseLayerMatrixAndDf):
424
453
 
425
454
  for field in ["layers", "obsm", "varm", "obsp", "varp"]:
426
455
  if field in fields:
427
- for key in field_to_entity[field].keys_to_remove:
456
+ for key in self.field_to_entity(field).keys_to_remove:
428
457
  del self._file[f"{field}/{key}"]
429
458
 
430
459
  def create_layer(
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: cap_anndata
3
- Version: 0.3.1
3
+ Version: 0.5.0
4
4
  Summary: Partial read/write of AnnData (h5ad) files for low-memory operations with large datasets.
5
5
  Home-page: https://github.com/cellannotation/cap-anndata
6
6
  Author: R. Mukhin, A. Isaev
@@ -20,9 +20,23 @@ Requires-Dist: anndata>=0.10.0
20
20
  Provides-Extra: dev
21
21
  Requires-Dist: pytest>=8.0.0; extra == "dev"
22
22
  Requires-Dist: setuptools~=69.1.1; extra == "dev"
23
+ Dynamic: author
24
+ Dynamic: author-email
25
+ Dynamic: classifier
26
+ Dynamic: description
27
+ Dynamic: description-content-type
28
+ Dynamic: home-page
29
+ Dynamic: license-file
30
+ Dynamic: project-url
31
+ Dynamic: provides-extra
32
+ Dynamic: requires-dist
33
+ Dynamic: requires-python
34
+ Dynamic: summary
23
35
 
24
36
  # CAP-AnnData: Partial I/O for AnnData (.h5ad) Files
25
37
 
38
+ [![PyPI version](https://img.shields.io/pypi/v/cap-anndata)](https://pypi.org/project/cap-anndata/) [![Build Status](https://github.com/cellannotation/cap-anndata/actions/workflows/python-app.yml/badge.svg)](https://github.com/cellannotation/cap-anndata/actions)
39
+
26
40
  ## Overview
27
41
  CAP-AnnData offers functionality for selective reading and writing of [AnnData](https://pypi.org/project/anndata/)
28
42
  file fields without the need to load the entire dataset (or even an entire field) into memory.
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name='cap_anndata',
5
- version='0.3.1',
5
+ version='0.5.0',
6
6
  author='R. Mukhin, A. Isaev',
7
7
  author_email='roman@ebookapplications.com',
8
8
  packages=find_packages(exclude=["test"]),
@@ -41,9 +41,8 @@ def test_remove_column():
41
41
 
42
42
  def test_from_df_class_method():
43
43
  data = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
44
- new_df = CapAnnDataDF.from_df(data, ["B", "A"])
45
-
46
- assert list(new_df.column_order) == ["B", "A"]
44
+ new_df = CapAnnDataDF.from_df(data)
45
+ assert list(new_df.column_order) == ["A", "B"]
47
46
 
48
47
 
49
48
  def test_column_order_integrity():
@@ -59,23 +58,22 @@ def test_column_order_integrity():
59
58
 
60
59
  def test_join():
61
60
  data1 = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
62
- data2 = pd.DataFrame({"D": [7, 8, 9], "E": [10, 11, 12]})
63
- cap_anndata_df1 = CapAnnDataDF.from_df(data1, column_order=["A", "B", "C"])
64
-
65
- cap_anndata_df1 = cap_anndata_df1.join(data2, how="left")
61
+ data2 = pd.DataFrame({"C": [7, 8, 9], "D": [10, 11, 12]})
62
+ cap_anndata_df1 = CapAnnDataDF.from_df(data1)
63
+ cap_anndata_df2 = cap_anndata_df1.join(data2, how="left")
66
64
 
67
- expected_order = ["A", "B", "C", "D", "E"]
68
- assert list(cap_anndata_df1.column_order) == expected_order
69
- assert cap_anndata_df1.shape == (3, 4)
65
+ expected_order = ["A", "B", "C", "D"]
66
+ assert list(cap_anndata_df2.column_order) == expected_order
67
+ assert cap_anndata_df2.shape == (3, 4)
70
68
 
71
69
 
72
70
  def test_merge():
73
71
  data1 = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
74
72
  data2 = pd.DataFrame({"A": [2, 3, 4], "D": [10, 11, 12]})
75
- cap_anndata_df1 = CapAnnDataDF.from_df(data1, column_order=["A", "B", "C"])
73
+ cap_anndata_df1 = CapAnnDataDF.from_df(data1)
76
74
 
77
75
  cap_anndata_df1 = cap_anndata_df1.merge(data2, how="inner", on="A")
78
76
 
79
- expected_order = ["A", "B", "C", "D"]
77
+ expected_order = ["A", "B", "D"]
80
78
  assert list(cap_anndata_df1.column_order) == expected_order
81
79
  assert cap_anndata_df1.shape == (2, 3)
@@ -103,6 +103,26 @@ def test_partial_read():
103
103
  pd.testing.assert_index_equal(adata.raw.var.index, cap_adata.raw.var.index)
104
104
 
105
105
 
106
+ def test_overwrite_dataframe_before_read_obs():
107
+ path = "tmp.h5ad"
108
+ x = np.ones((10, 10), dtype=np.float32)
109
+ adata = ad.AnnData(X=x)
110
+ adata.obs["columns"] = "value"
111
+ adata.write_h5ad(path)
112
+ del adata
113
+
114
+ with read_h5ad(path, True) as adata:
115
+ # https://github.com/cellannotation/cap-anndata/issues/33
116
+ adata.obs["new_column"] = "new_value"
117
+ adata.overwrite(["obs"])
118
+
119
+ with read_h5ad(path) as adata:
120
+ adata.read_obs("new_column")
121
+ assert (adata.obs["new_column"] == "new_value").all(), "Wrong values in column!"
122
+
123
+ os.remove(path)
124
+
125
+
106
126
  @pytest.mark.parametrize("compression", ["gzip", "lzf"])
107
127
  def test_overwrite_df(compression):
108
128
  adata = get_filled_anndata()
@@ -110,12 +130,17 @@ def test_overwrite_df(compression):
110
130
  file_path = os.path.join(temp_folder, "test_overwrite_df.h5ad")
111
131
  adata.write_h5ad(file_path)
112
132
 
133
+ new_obs_index = None
113
134
  with read_h5ad(file_path, edit=True) as cap_adata:
135
+ # Modify 'obs'
114
136
  cap_adata.read_obs(columns=["cell_type"])
115
137
  cap_adata.obs["cell_type"] = [
116
138
  f"new_cell_type_{i%2}" for i in range(cap_adata.shape[0])
117
139
  ]
118
140
  cap_adata.obs["const_str"] = "some string"
141
+ # Modify obs 'index'
142
+ new_obs_index = [s + "_new" for s in cap_adata.obs.index]
143
+ cap_adata.obs.index = new_obs_index
119
144
  ref_obs = cap_adata.obs.copy()
120
145
 
121
146
  # Modify 'var'
@@ -144,6 +169,7 @@ def test_overwrite_df(compression):
144
169
  pd.testing.assert_frame_equal(
145
170
  ref_obs, adata.obs[ref_obs.columns.to_list()], check_frame_type=False
146
171
  )
172
+ assert (adata.obs.index == new_obs_index).all(), "Index must be changed!"
147
173
 
148
174
  # Assert changes in 'var'
149
175
  assert all([c in adata.var.columns for c in ref_var.columns])
@@ -689,3 +715,121 @@ def test_modify_obsp_varp(field):
689
715
  assert len(getattr(cap_adata, field).keys()) == 0
690
716
 
691
717
  os.remove(file_path)
718
+
719
+
720
+ def test_main_var_layers():
721
+ var_index = [f"ind_{i}" for i in range(10)]
722
+ raw_var_index = [f"raw_ind_{i}" for i in range(10)]
723
+
724
+ x = np.eye(10, dtype=np.float32)
725
+ raw_x = x * 2
726
+ adata = ad.AnnData(X=raw_x)
727
+ adata.var.index = raw_var_index
728
+ adata.raw = adata
729
+ adata.X = x
730
+ adata.var.index = var_index
731
+
732
+ temp_folder = tempfile.mkdtemp()
733
+ file_path = os.path.join(temp_folder, "test_main_var_layers.h5ad")
734
+ adata.write_h5ad(file_path)
735
+
736
+ with read_h5ad(file_path) as cap_anndata:
737
+ assert cap_anndata.var.index.tolist() == var_index
738
+ assert cap_anndata.raw.var.index.tolist() == raw_var_index
739
+ assert np.allclose(cap_anndata.X[:], x)
740
+ assert np.allclose(cap_anndata.raw.X[:], raw_x)
741
+
742
+ os.remove(file_path)
743
+
744
+
745
+ @pytest.mark.parametrize("name", ["barcodes", "", None])
746
+ def test_modify_index(name):
747
+ adata = get_base_anndata()
748
+
749
+ temp_folder = tempfile.mkdtemp()
750
+ file_path = os.path.join(temp_folder, "test_main_var_layers.h5ad")
751
+ adata.write_h5ad(file_path)
752
+
753
+ with read_h5ad(file_path=file_path, edit=True) as cap_adata:
754
+ cap_adata.read_obs()
755
+ cap_adata.overwrite(["obs"])
756
+
757
+ cap_adata = ad.read_h5ad(file_path)
758
+ pd.testing.assert_frame_equal(
759
+ left=adata.obs,
760
+ right=cap_adata.obs,
761
+ check_dtype=True,
762
+ check_index_type=True,
763
+ check_names=True,
764
+ )
765
+
766
+ with read_h5ad(file_path=file_path, edit=True) as cap_adata:
767
+ cap_adata.read_obs()
768
+ cap_adata.obs.index = pd.Series(data=[f"cell_{i}" for i in range(cap_adata.shape[0])], name=name)
769
+ cap_adata.overwrite(["obs"])
770
+
771
+ with read_h5ad(file_path=file_path, edit=False) as cap_adata:
772
+ cap_adata.read_obs()
773
+ obs = cap_adata.obs
774
+
775
+ assert obs is not None, "DataFrame must be loaded!"
776
+ assert obs.index is not None, "DataFrame must have Index!"
777
+ if not name:
778
+ assert obs.index.name == None, "Index name must not be set!"
779
+ else:
780
+ assert obs.index.name == name, "Index name must be set!"
781
+ assert obs.index.to_list() == [f"cell_{i}" for i in range(cap_adata.shape[0])], "Wrong index values!"
782
+
783
+
784
+ def test_column_order_changes():
785
+ adata = get_base_anndata(n_rows = 3, n_genes = 2, sparse=False)
786
+
787
+ temp_folder = tempfile.mkdtemp()
788
+ file_path = os.path.join(temp_folder, "test_column_order.h5ad")
789
+ adata.write_h5ad(file_path)
790
+
791
+ data = {"A": [1, 2, 3], "B": [4, 5, 6]}
792
+ with read_h5ad(file_path=file_path, edit=True) as cap_adata:
793
+ df = pd.DataFrame(data)
794
+ cap_df = CapAnnDataDF.from_df(df)
795
+ cap_adata.obs = CapAnnDataDF.from_df(cap_df)
796
+ cap_adata.overwrite(["obs"])
797
+
798
+ new_column_order = list(data.keys())
799
+ new_column_order.reverse()
800
+ with read_h5ad(file_path=file_path, edit=True) as cap_adata:
801
+ cap_adata.read_obs()
802
+ df = cap_adata.obs[new_column_order] # change order via dataframe
803
+ cap_df = CapAnnDataDF.from_df(df)
804
+ cap_adata.obs = cap_df
805
+ cap_adata.overwrite(["obs"])
806
+
807
+ new_column_order.reverse()
808
+ with read_h5ad(file_path=file_path, edit=True) as cap_adata:
809
+ cap_adata.read_obs()
810
+ cap_df = cap_adata.obs
811
+ cap_df.column_order = new_column_order # change order via column_order
812
+ cap_adata.obs = cap_df
813
+ cap_adata.overwrite(["obs"])
814
+
815
+ with read_h5ad(file_path=file_path) as cap_adata:
816
+ cap_adata.read_obs()
817
+ assert list(cap_adata.obs.column_order) == new_column_order
818
+ assert list(cap_adata.obs.columns) == new_column_order
819
+
820
+
821
+ def test_df_in_obsm():
822
+ adata = get_base_anndata(n_rows = 3, n_genes = 2, sparse=False)
823
+ df = pd.DataFrame(index=adata.obs.index, data={"n": 1})
824
+ adata.obsm["df"] = df
825
+
826
+ temp_folder = tempfile.mkdtemp()
827
+ file_path = os.path.join(temp_folder, "test_df_in_obsm.h5ad")
828
+ adata.write_h5ad(file_path)
829
+
830
+ with read_h5ad(file_path=file_path, edit=False) as cap_adata:
831
+ assert cap_adata.obsm_keys() == ["df"]
832
+ cap_df = cap_adata.obsm["df"]
833
+ assert cap_df.shape == df.shape
834
+ assert cap_df.columns == df.columns
835
+ assert (cap_df["n"] == df["n"]).all()
@@ -1,69 +0,0 @@
1
- import pandas as pd
2
- import numpy as np
3
- from typing import List, Any, Union
4
- import logging
5
-
6
- from pandas._typing import Self
7
- from pandas.core.generic import bool_t
8
-
9
- logger = logging.getLogger(__name__)
10
-
11
-
12
- class CapAnnDataDF(pd.DataFrame):
13
- """
14
- The class to expand the pandas DataFrame behaviour to support partial
15
- reading and writing of AnnData obs and var (raw.var) fields.
16
- The main feature of the class is handling <column-order> attribute
17
- which must be a copy of h5py.Group attribute
18
- """
19
-
20
- _metadata = ["column_order"]
21
-
22
- def rename_column(self, old_name: str, new_name: str) -> None:
23
- i = np.where(self.column_order == old_name)[0]
24
- self.column_order[i] = new_name
25
- self.rename(columns={old_name: new_name}, inplace=True)
26
-
27
- def remove_column(self, col_name: str) -> None:
28
- i = np.where(self.column_order == col_name)[0]
29
- self.column_order = np.delete(self.column_order, i)
30
- self.drop(columns=[col_name], inplace=True)
31
-
32
- def __setitem__(self, key, value) -> None:
33
- if key not in self.column_order:
34
- self.column_order = np.append(self.column_order, key)
35
- return super().__setitem__(key, value)
36
-
37
- @classmethod
38
- def from_df(cls, df: pd.DataFrame, column_order: List[str] = None) -> Self:
39
- if column_order is None:
40
- column_order = df.columns.to_numpy()
41
-
42
- new_inst = cls(df)
43
- new_inst.column_order = column_order
44
- return new_inst
45
-
46
- def join(self, other: Any, **kwargs) -> Self:
47
- result = super().join(other=other, **kwargs)
48
- if isinstance(other, CapAnnDataDF):
49
- new_columns = [
50
- col for col in other.column_order if col not in self.column_order
51
- ]
52
- else:
53
- new_columns = [col for col in other.columns if col not in self.column_order]
54
- column_order = np.append(self.column_order, new_columns)
55
- return self.from_df(result, column_order=column_order)
56
-
57
- def merge(self, right, **kwargs) -> Self:
58
- result = super().merge(right=right, **kwargs)
59
- if isinstance(right, CapAnnDataDF):
60
- new_columns = [
61
- col for col in right.column_order if col not in self.column_order
62
- ]
63
- else:
64
- new_columns = [col for col in right.columns if col not in self.column_order]
65
- column_order = np.append(self.column_order, new_columns)
66
- return self.from_df(result, column_order=column_order)
67
-
68
- def copy(self, deep: Union[bool_t, None] = True) -> Self:
69
- return self.from_df(super().copy(deep=deep), column_order=self.column_order)
File without changes
File without changes