cap-anndata 0.3.1.tar.gz → 0.5.0.tar.gz
This diff shows the differences between two publicly released versions of the package, as they appear in their respective public registries; it is provided for informational purposes only.
- {cap_anndata-0.3.1 → cap_anndata-0.5.0}/PKG-INFO +16 -2
- {cap_anndata-0.3.1 → cap_anndata-0.5.0}/README.md +2 -0
- cap_anndata-0.5.0/cap_anndata/backed_df.py +81 -0
- {cap_anndata-0.3.1 → cap_anndata-0.5.0}/cap_anndata/cap_anndata.py +77 -48
- {cap_anndata-0.3.1 → cap_anndata-0.5.0}/cap_anndata.egg-info/PKG-INFO +16 -2
- {cap_anndata-0.3.1 → cap_anndata-0.5.0}/setup.py +1 -1
- {cap_anndata-0.3.1 → cap_anndata-0.5.0}/test/test_backed_df.py +10 -12
- {cap_anndata-0.3.1 → cap_anndata-0.5.0}/test/test_cap_anndata.py +144 -0
- cap_anndata-0.3.1/cap_anndata/backed_df.py +0 -69
- {cap_anndata-0.3.1 → cap_anndata-0.5.0}/LICENSE +0 -0
- {cap_anndata-0.3.1 → cap_anndata-0.5.0}/cap_anndata/__init__.py +0 -0
- {cap_anndata-0.3.1 → cap_anndata-0.5.0}/cap_anndata/backed_dict.py +0 -0
- {cap_anndata-0.3.1 → cap_anndata-0.5.0}/cap_anndata/reader.py +0 -0
- {cap_anndata-0.3.1 → cap_anndata-0.5.0}/cap_anndata.egg-info/SOURCES.txt +0 -0
- {cap_anndata-0.3.1 → cap_anndata-0.5.0}/cap_anndata.egg-info/dependency_links.txt +0 -0
- {cap_anndata-0.3.1 → cap_anndata-0.5.0}/cap_anndata.egg-info/requires.txt +0 -0
- {cap_anndata-0.3.1 → cap_anndata-0.5.0}/cap_anndata.egg-info/top_level.txt +0 -0
- {cap_anndata-0.3.1 → cap_anndata-0.5.0}/setup.cfg +0 -0
- {cap_anndata-0.3.1 → cap_anndata-0.5.0}/test/test_backed_dict.py +0 -0
- {cap_anndata-0.3.1 → cap_anndata-0.5.0}/test/test_reader.py +0 -0
{cap_anndata-0.3.1 → cap_anndata-0.5.0}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: cap_anndata
-Version: 0.3.1
+Version: 0.5.0
 Summary: Partial read/write of AnnData (h5ad) files for low-memory operations with large datasets.
 Home-page: https://github.com/cellannotation/cap-anndata
 Author: R. Mukhin, A. Isaev
@@ -20,9 +20,23 @@ Requires-Dist: anndata>=0.10.0
 Provides-Extra: dev
 Requires-Dist: pytest>=8.0.0; extra == "dev"
 Requires-Dist: setuptools~=69.1.1; extra == "dev"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license-file
+Dynamic: project-url
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 # CAP-AnnData: Partial I/O for AnnData (.h5ad) Files
 
+[](https://pypi.org/project/cap-anndata/) [](https://github.com/cellannotation/cap-anndata/actions)
+
 ## Overview
 CAP-AnnData offering functionalities for selective reading and writing of [AnnData](https://pypi.org/project/anndata/)
 file fields without the need for loading entire dataset (or even entire field) into memory.
```
{cap_anndata-0.3.1 → cap_anndata-0.5.0}/README.md

```diff
@@ -1,5 +1,7 @@
 # CAP-AnnData: Partial I/O for AnnData (.h5ad) Files
 
+[](https://pypi.org/project/cap-anndata/) [](https://github.com/cellannotation/cap-anndata/actions)
+
 ## Overview
 CAP-AnnData offering functionalities for selective reading and writing of [AnnData](https://pypi.org/project/anndata/)
 file fields without the need for loading entire dataset (or even entire field) into memory.
```
cap_anndata-0.5.0/cap_anndata/backed_df.py (added)

```diff
@@ -0,0 +1,81 @@
+import pandas as pd
+import numpy as np
+from typing import List, Any, Union
+
+from pandas._typing import Self
+from pandas.core.generic import bool_t
+
+
+class CapAnnDataDF(pd.DataFrame):
+    """
+    The class to expand the pandas DataFrame behaviour to support partial
+    reading and writing of AnnData obs and var (raw.var) fields.
+    The main feature of the class is handling <column-order> attribute
+    which must be a copy of h5py.Group attribute
+    """
+
+    _metadata = ["column_order"]
+
+    def column_order_array(self) -> np.array:
+        order = self.column_order
+        if order is not None and isinstance(order, List):
+            # Convert it to numpy array of str elements
+            return np.array(order, dtype=object)
+        else:
+            return order
+
+    def rename_column(self, old_name: str, new_name: str) -> None:
+        i = np.where(self.column_order_array() == old_name)[0]
+        tmp_array = self.column_order_array().copy()
+        tmp_array[i] = new_name
+        self.column_order = tmp_array.copy()
+        self.rename(columns={old_name: new_name}, inplace=True)
+
+    def remove_column(self, col_name: str) -> None:
+        i = np.where(self.column_order_array() == col_name)[0]
+        self.column_order = np.delete(self.column_order_array(), i)
+        self.drop(columns=[col_name], inplace=True)
+
+    def __setitem__(self, key, value) -> None:
+        if key not in self.column_order_array():
+            self.column_order = np.append(self.column_order_array(), key)
+        return super().__setitem__(key, value)
+
+    @classmethod
+    def from_df(cls, df: pd.DataFrame, column_order: Union[np.array, List[str], None] = None) -> Self:
+        if column_order is None:
+            column_order = df.columns.to_numpy()
+        elif isinstance(column_order, List):
+            column_order = np.array(column_order)
+        new_inst = cls(df)
+        new_inst.column_order = column_order
+        return new_inst
+
+    def join(self, other: Any, **kwargs) -> Self:
+        result = super().join(other=other, **kwargs)
+        if isinstance(other, CapAnnDataDF):
+            new_columns = [
+                col for col in other.column_order_array() if col not in self.column_order_array()
+            ]
+        else:
+            new_columns = [col for col in other.columns if col not in self.column_order_array()]
+        column_order = np.append(self.column_order_array(), new_columns)
+        df = self.from_df(result, column_order=column_order)
+        return df
+
+    def merge(self, right, **kwargs) -> Self:
+        result = super().merge(right=right, **kwargs)
+        if isinstance(right, CapAnnDataDF):
+            new_columns = [
+                col for col in right.column_order_array() if col not in self.column_order_array()
+            ]
+        else:
+            new_columns = [col for col in right.columns if col not in self.column_order_array()]
+        column_order = np.append(self.column_order_array(), new_columns)
+        df = self.from_df(result, column_order=column_order)
+        return df
+
+    def copy(self, deep: Union[bool_t, None] = True) -> Self:
+        column_order = self.column_order_array()
+        df = self.from_df(super().copy(deep=deep), column_order=column_order)
+        return df
```
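To make the new behaviour concrete, here is a minimal usage sketch of the rewritten class (assuming cap-anndata 0.5.0 is installed; the data values are illustrative):

```python
import pandas as pd
from cap_anndata import CapAnnDataDF

df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
cap_df = CapAnnDataDF.from_df(df, column_order=["B", "A"])  # list input is coerced to a numpy array

# column_order_array() normalizes the tracked order to a numpy array, so the
# np.where/np.append calls in the methods above always operate on one type.
print(cap_df.column_order_array())  # ['B' 'A']

# __setitem__ keeps the order attribute in sync when a new column appears.
cap_df["C"] = [7, 8, 9]
print(cap_df.column_order_array())  # ['B' 'A' 'C']
```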
{cap_anndata-0.3.1 → cap_anndata-0.5.0}/cap_anndata/cap_anndata.py

```diff
@@ -7,19 +7,33 @@ import scipy.sparse as ss
 from packaging import version
 
 if version.parse(ad.__version__) < version.parse("0.11.0"):
-    from anndata.experimental import
+    from anndata.experimental import (
+        sparse_dataset,
+        read_elem,
+        write_elem,
+        CSRDataset,
+        CSCDataset,
+    )
 else:
-    from anndata.io import
+    from anndata.io import (
+        sparse_dataset,
+        read_elem,
+        write_elem,
+    )
+    from anndata.abc import (
+        CSRDataset,
+        CSCDataset,
+    )
 
 from cap_anndata import CapAnnDataDF, CapAnnDataDict
 
 logger = logging.getLogger(__name__)
 
 X_NOTATION = Union[
-    h5py.Dataset,
+    h5py.Dataset, CSRDataset, CSCDataset, None
 ]
 ARRAY_MAPPING_NOTATION = CapAnnDataDict[str, X_NOTATION]
-
+FIELDS_SUPPORTED_TO_OVERWRITE = ["obs", "var", "raw.var", "uns", "layers", "obsm", "varm", "obsp", "varp"]
 NotLinkedObject: Final = "__NotLinkedObject"
 
 
@@ -57,15 +71,7 @@ class BaseLayerMatrixAndDf:
         return shape
 
     def _lazy_df_load(self, key: str) -> CapAnnDataDF:
-        df = CapAnnDataDF()
-        attribute = self._path_to_content + key
-        column_order = self._read_attr(self._file[attribute], "column-order")
-        df.column_order = column_order
-        if df.column_order.dtype != object:
-            # empty DataFrame will have column_order as float64
-            # which leads to failure in overwrite method
-            df.column_order = df.column_order.astype(object)
-        return df
+        return self._read_df(key=key, columns=[])
 
     @staticmethod
     def _read_attr(obj: Union[h5py.Group, h5py.Dataset], attr_name: str) -> any:
@@ -93,8 +99,10 @@ class BaseLayerMatrixAndDf:
         cols_to_read = [c for c in columns if c in column_order]
         df = CapAnnDataDF()
         df.column_order = column_order
+
         index_col = self._read_attr(h5_group, "_index")
-
+        index = read_elem(h5_group[index_col])
+        df.index = index
 
         for col in cols_to_read:
             df[col] = read_elem(h5_group[col])
@@ -135,15 +143,19 @@ class BaseLayerMatrixAndDf:
         if not isinstance(group, h5py.Group):
             raise ValueError(f"The object {key} must be a group!")
 
-        for
-
-        if isinstance(
-            cap_dict[
-        elif isinstance(
-
+        for entity_name in group.keys():
+            entity = group[entity_name]
+            if isinstance(entity, h5py.Dataset):
+                cap_dict[entity_name] = entity
+            elif isinstance(entity, h5py.Group):
+                enc_type = dict(entity.attrs).get("encoding-type")
+                if enc_type == "dataframe":
+                    cap_dict[entity_name] = self._read_df(key="/".join([key, entity_name]), columns=None)
+                elif enc_type in ["csc_matrix", "csr_matrix"]:
+                    cap_dict[entity_name] = sparse_dataset(entity)
             else:
                 raise ValueError(
-                    f"Can't link array in {key} due to unsupported type of object: {type(
+                    f"Can't link array in {key} due to unsupported type of object: {type(entity)}"
                 )
 
     def _create_new_matrix(
@@ -252,11 +264,11 @@ class CapAnnData(BaseLayerMatrixAndDf):
     def raw(self) -> RawLayer:
        if self._raw is None:
            if "raw" not in self._file.keys():
-                logger.
+                logger.debug("Can't read raw.var since raw layer doesn't exist!")
                 return
 
            if len(self._file["raw"].keys()) == 0:
-                logger.
+                logger.debug("The raw layer is empty!")
                 return
 
            self._raw = RawLayer(self._file)
@@ -366,37 +378,43 @@ class CapAnnData(BaseLayerMatrixAndDf):
         return list(self.obsm.keys())
 
     def obs_keys(self) -> List[str]:
-        return self.obs.
+        return self.obs.column_order_array().tolist()
 
     def var_keys(self) -> List[str]:
-        return self.var.
+        return self.var.column_order_array().tolist()
+
+    def field_to_entity(self, key):
+        if key == "obs":
+            return self.obs
+        elif key == "var":
+            return self.var
+        elif key == "raw.var":
+            return self.raw.var if self.raw is not None else None
+        elif key == "uns":
+            return self.uns
+        elif key == "layers":
+            return self.layers
+        elif key == "obsm":
+            return self.obsm
+        elif key == "varm":
+            return self.varm
+        elif key == "obsp":
+            return self.obsp
+        elif key == "varp":
+            return self.varp
+        else:
+            raise KeyError(
+                f"The field {key} is not supported! The list of supported fields are equal to {FIELDS_SUPPORTED_TO_OVERWRITE} "
+                f"attributes of the CapAnnData class."
+            )
 
     def overwrite(self, fields: List[str] = None, compression: str = "lzf") -> None:
-        field_to_entity = {
-            "obs": self.obs,
-            "var": self.var,
-            "raw.var": self.raw.var if self.raw is not None else None,
-            "uns": self.uns,
-            "layers": self.layers,
-            "obsm": self.obsm,
-            "varm": self.varm,
-            "obsp": self.obsp,
-            "varp": self.varp,
-        }
-
         if fields is None:
-            fields =
-        else:
-            for f in fields:
-                if f not in field_to_entity.keys():
-                    raise KeyError(
-                        f"The field {f} is not supported! The list of supported fields are equal to supported "
-                        f"attributes of the CapAnnData class: obs, var, raw.var and uns."
-                    )
+            fields = FIELDS_SUPPORTED_TO_OVERWRITE
 
         for key in ["obs", "var", "raw.var"]:
             if key in fields:
-                entity: CapAnnDataDF = field_to_entity
+                entity: CapAnnDataDF = self.field_to_entity(key)
                 if entity is None:
                     continue
 
@@ -407,11 +425,22 @@ class CapAnnData(BaseLayerMatrixAndDf):
                     f"{key}/{col}", entity[col].values, compression=compression
                 )
 
-                column_order = entity.
+                column_order = entity.column_order_array()
                 if (
                     column_order.size == 0
                 ):  # Refs https://github.com/cellannotation/cap-anndata/issues/6
                     column_order = np.array([], dtype=np.float64)
+
+                # Index update
+                index_name = entity.index.name
+                if not index_name:
+                    index_name = "_index"
+                self._file[key].attrs["_index"] = index_name
+                index_col = self._read_attr(self._file[key], "_index")
+                self._write_elem(
+                    f"{key}/{index_col}", entity.index.to_numpy(), compression=compression
+                )
+
                 self._file[key].attrs["column-order"] = column_order
 
         if "uns" in fields:
@@ -424,7 +453,7 @@ class CapAnnData(BaseLayerMatrixAndDf):
 
         for field in ["layers", "obsm", "varm", "obsp", "varp"]:
             if field in fields:
-                for key in field_to_entity
+                for key in self.field_to_entity(field).keys_to_remove:
                     del self._file[f"{field}/{key}"]
 
     def create_layer(
```
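The added "Index update" block in `overwrite()` is what lets a renamed obs/var index survive a write-back. A minimal round-trip sketch, assuming cap-anndata 0.5.0 and an illustrative file name (the tests further below exercise the same path):

```python
import numpy as np
import anndata as ad
from cap_anndata import read_h5ad

adata = ad.AnnData(X=np.ones((10, 10), dtype=np.float32))
adata.write_h5ad("tmp.h5ad")

with read_h5ad("tmp.h5ad", edit=True) as cap_adata:
    cap_adata.read_obs()
    # Rename the obs index; overwrite() now writes it back through the
    # group's "_index" attribute instead of dropping the change.
    cap_adata.obs.index = [f"cell_{i}" for i in range(cap_adata.shape[0])]
    cap_adata.overwrite(["obs"])
```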
{cap_anndata-0.3.1 → cap_anndata-0.5.0}/cap_anndata.egg-info/PKG-INFO

```diff
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: cap_anndata
-Version: 0.3.1
+Version: 0.5.0
 Summary: Partial read/write of AnnData (h5ad) files for low-memory operations with large datasets.
 Home-page: https://github.com/cellannotation/cap-anndata
 Author: R. Mukhin, A. Isaev
@@ -20,9 +20,23 @@ Requires-Dist: anndata>=0.10.0
 Provides-Extra: dev
 Requires-Dist: pytest>=8.0.0; extra == "dev"
 Requires-Dist: setuptools~=69.1.1; extra == "dev"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license-file
+Dynamic: project-url
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 # CAP-AnnData: Partial I/O for AnnData (.h5ad) Files
 
+[](https://pypi.org/project/cap-anndata/) [](https://github.com/cellannotation/cap-anndata/actions)
+
 ## Overview
 CAP-AnnData offering functionalities for selective reading and writing of [AnnData](https://pypi.org/project/anndata/)
 file fields without the need for loading entire dataset (or even entire field) into memory.
```
{cap_anndata-0.3.1 → cap_anndata-0.5.0}/test/test_backed_df.py

```diff
@@ -41,9 +41,8 @@ def test_remove_column():
 
 def test_from_df_class_method():
     data = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
-    new_df = CapAnnDataDF.from_df(data
-
-    assert list(new_df.column_order) == ["B", "A"]
+    new_df = CapAnnDataDF.from_df(data)
+    assert list(new_df.column_order) == ["A", "B"]
 
 
 def test_column_order_integrity():
@@ -59,23 +58,22 @@ def test_column_order_integrity():
 
 def test_join():
     data1 = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
-    data2 = pd.DataFrame({"
-    cap_anndata_df1 = CapAnnDataDF.from_df(data1
-
-    cap_anndata_df1 = cap_anndata_df1.join(data2, how="left")
+    data2 = pd.DataFrame({"C": [7, 8, 9], "D": [10, 11, 12]})
+    cap_anndata_df1 = CapAnnDataDF.from_df(data1)
+    cap_anndata_df2 = cap_anndata_df1.join(data2, how="left")
 
-    expected_order = ["A", "B", "C", "D"
-    assert list(
-    assert
+    expected_order = ["A", "B", "C", "D"]
+    assert list(cap_anndata_df2.column_order) == expected_order
+    assert cap_anndata_df2.shape == (3, 4)
 
 
 def test_merge():
     data1 = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
     data2 = pd.DataFrame({"A": [2, 3, 4], "D": [10, 11, 12]})
-    cap_anndata_df1 = CapAnnDataDF.from_df(data1
+    cap_anndata_df1 = CapAnnDataDF.from_df(data1)
 
     cap_anndata_df1 = cap_anndata_df1.merge(data2, how="inner", on="A")
 
-    expected_order = ["A", "B", "
+    expected_order = ["A", "B", "D"]
     assert list(cap_anndata_df1.column_order) == expected_order
     assert cap_anndata_df1.shape == (2, 3)
```
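The corrected expectations follow from `join()` and `merge()` re-wrapping their results through `from_df()`, so appended columns land at the end of `column_order`. A small sketch mirroring the join test, assuming cap-anndata 0.5.0:

```python
import pandas as pd
from cap_anndata import CapAnnDataDF

df1 = CapAnnDataDF.from_df(pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}))
df2 = pd.DataFrame({"C": [7, 8, 9], "D": [10, 11, 12]})

joined = df1.join(df2, how="left")
# Columns contributed by the other frame are appended to column_order.
assert list(joined.column_order) == ["A", "B", "C", "D"]
assert joined.shape == (3, 4)
```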
{cap_anndata-0.3.1 → cap_anndata-0.5.0}/test/test_cap_anndata.py

```diff
@@ -103,6 +103,26 @@ def test_partial_read():
     pd.testing.assert_index_equal(adata.raw.var.index, cap_adata.raw.var.index)
 
 
+def test_overwrite_dataframe_before_read_obs():
+    path = "tmp.h5ad"
+    x = np.ones((10, 10), dtype=np.float32)
+    adata = ad.AnnData(X=x)
+    adata.obs["columns"] = "value"
+    adata.write_h5ad(path)
+    del adata
+
+    with read_h5ad(path, True) as adata:
+        # https://github.com/cellannotation/cap-anndata/issues/33
+        adata.obs["new_column"] = "new_value"
+        adata.overwrite(["obs"])
+
+    with read_h5ad(path) as adata:
+        adata.read_obs("new_column")
+        assert (adata.obs["new_column"] == "new_value").all(), "Wrong values in column!"
+
+    os.remove(path)
+
+
 @pytest.mark.parametrize("compression", ["gzip", "lzf"])
 def test_overwrite_df(compression):
     adata = get_filled_anndata()
@@ -110,12 +130,17 @@ def test_overwrite_df(compression):
     file_path = os.path.join(temp_folder, "test_overwrite_df.h5ad")
     adata.write_h5ad(file_path)
 
+    new_obs_index = None
     with read_h5ad(file_path, edit=True) as cap_adata:
+        # Modify 'obs'
         cap_adata.read_obs(columns=["cell_type"])
         cap_adata.obs["cell_type"] = [
             f"new_cell_type_{i%2}" for i in range(cap_adata.shape[0])
         ]
         cap_adata.obs["const_str"] = "some string"
+        # Modify obs 'index'
+        new_obs_index = [s + "_new" for s in cap_adata.obs.index]
+        cap_adata.obs.index = new_obs_index
         ref_obs = cap_adata.obs.copy()
 
     # Modify 'var'
@@ -144,6 +169,7 @@ def test_overwrite_df(compression):
     pd.testing.assert_frame_equal(
         ref_obs, adata.obs[ref_obs.columns.to_list()], check_frame_type=False
     )
+    assert (adata.obs.index == new_obs_index).all(), "Index must be changed!"
 
     # Assert changes in 'var'
     assert all([c in adata.var.columns for c in ref_var.columns])
@@ -689,3 +715,121 @@ def test_modify_obsp_varp(field):
     assert len(getattr(cap_adata, field).keys()) == 0
 
     os.remove(file_path)
+
+
+def test_main_var_layers():
+    var_index = [f"ind_{i}" for i in range(10)]
+    raw_var_index = [f"raw_ind_{i}" for i in range(10)]
+
+    x = np.eye(10, dtype=np.float32)
+    raw_x = x * 2
+    adata = ad.AnnData(X=raw_x)
+    adata.var.index = raw_var_index
+    adata.raw = adata
+    adata.X = x
+    adata.var.index = var_index
+
+    temp_folder = tempfile.mkdtemp()
+    file_path = os.path.join(temp_folder, "test_main_var_layers.h5ad")
+    adata.write_h5ad(file_path)
+
+    with read_h5ad(file_path) as cap_anndata:
+        assert cap_anndata.var.index.tolist() == var_index
+        assert cap_anndata.raw.var.index.tolist() == raw_var_index
+        assert np.allclose(cap_anndata.X[:], x)
+        assert np.allclose(cap_anndata.raw.X[:], raw_x)
+
+    os.remove(file_path)
+
+
+@pytest.mark.parametrize("name", ["barcodes", "", None])
+def test_modify_index(name):
+    adata = get_base_anndata()
+
+    temp_folder = tempfile.mkdtemp()
+    file_path = os.path.join(temp_folder, "test_main_var_layers.h5ad")
+    adata.write_h5ad(file_path)
+
+    with read_h5ad(file_path=file_path, edit=True) as cap_adata:
+        cap_adata.read_obs()
+        cap_adata.overwrite(["obs"])
+
+    cap_adata = ad.read_h5ad(file_path)
+    pd.testing.assert_frame_equal(
+        left=adata.obs,
+        right=cap_adata.obs,
+        check_dtype=True,
+        check_index_type=True,
+        check_names=True,
+    )
+
+    with read_h5ad(file_path=file_path, edit=True) as cap_adata:
+        cap_adata.read_obs()
+        cap_adata.obs.index = pd.Series(data=[f"cell_{i}" for i in range(cap_adata.shape[0])], name=name)
+        cap_adata.overwrite(["obs"])
+
+    with read_h5ad(file_path=file_path, edit=False) as cap_adata:
+        cap_adata.read_obs()
+        obs = cap_adata.obs
+
+    assert obs is not None, "DataFrame must be loaded!"
+    assert obs.index is not None, "DataFrame must have Index!"
+    if not name:
+        assert obs.index.name == None, "Index name must not be set!"
+    else:
+        assert obs.index.name == name, "Index name must be set!"
+    assert obs.index.to_list() == [f"cell_{i}" for i in range(cap_adata.shape[0])], "Wrong index values!"
+
+
+def test_column_order_changes():
+    adata = get_base_anndata(n_rows = 3, n_genes = 2, sparse=False)
+
+    temp_folder = tempfile.mkdtemp()
+    file_path = os.path.join(temp_folder, "test_column_order.h5ad")
+    adata.write_h5ad(file_path)
+
+    data = {"A": [1, 2, 3], "B": [4, 5, 6]}
+    with read_h5ad(file_path=file_path, edit=True) as cap_adata:
+        df = pd.DataFrame(data)
+        cap_df = CapAnnDataDF.from_df(df)
+        cap_adata.obs = CapAnnDataDF.from_df(cap_df)
+        cap_adata.overwrite(["obs"])
+
+    new_column_order = list(data.keys())
+    new_column_order.reverse()
+    with read_h5ad(file_path=file_path, edit=True) as cap_adata:
+        cap_adata.read_obs()
+        df = cap_adata.obs[new_column_order]  # change order via dataframe
+        cap_df = CapAnnDataDF.from_df(df)
+        cap_adata.obs = cap_df
+        cap_adata.overwrite(["obs"])
+
+    new_column_order.reverse()
+    with read_h5ad(file_path=file_path, edit=True) as cap_adata:
+        cap_adata.read_obs()
+        cap_df = cap_adata.obs
+        cap_df.column_order = new_column_order  # change order via column_order
+        cap_adata.obs = cap_df
+        cap_adata.overwrite(["obs"])
+
+    with read_h5ad(file_path=file_path) as cap_adata:
+        cap_adata.read_obs()
+        assert list(cap_adata.obs.column_order) == new_column_order
+        assert list(cap_adata.obs.columns) == new_column_order
+
+
+def test_df_in_obsm():
+    adata = get_base_anndata(n_rows = 3, n_genes = 2, sparse=False)
+    df = pd.DataFrame(index=adata.obs.index, data={"n": 1})
+    adata.obsm["df"] = df
+
+    temp_folder = tempfile.mkdtemp()
+    file_path = os.path.join(temp_folder, "test_df_in_obsm.h5ad")
+    adata.write_h5ad(file_path)
+
+    with read_h5ad(file_path=file_path, edit=False) as cap_adata:
+        assert cap_adata.obsm_keys() == ["df"]
+        cap_df = cap_adata.obsm["df"]
+        assert cap_df.shape == df.shape
+        assert cap_df.columns == df.columns
+        assert (cap_df["n"] == df["n"]).all()
```
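Among the new tests, `test_df_in_obsm` covers the freshly added dataframe branch in `_link_array_mapping`. A condensed sketch of the same scenario, assuming cap-anndata 0.5.0 (the file name is illustrative):

```python
import anndata as ad
import numpy as np
import pandas as pd
from cap_anndata import read_h5ad

adata = ad.AnnData(X=np.zeros((3, 2), dtype=np.float32))
adata.obsm["df"] = pd.DataFrame(index=adata.obs.index, data={"n": 1})
adata.write_h5ad("obsm_df.h5ad")

with read_h5ad("obsm_df.h5ad") as cap_adata:
    # obsm entries whose encoding-type is "dataframe" are now linked
    # and returned as CapAnnDataDF objects.
    cap_df = cap_adata.obsm["df"]
    print(cap_df["n"].tolist())  # [1, 1, 1]
```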
cap_anndata-0.3.1/cap_anndata/backed_df.py (removed)

```diff
@@ -1,69 +0,0 @@
-import pandas as pd
-import numpy as np
-from typing import List, Any, Union
-import logging
-
-from pandas._typing import Self
-from pandas.core.generic import bool_t
-
-logger = logging.getLogger(__name__)
-
-
-class CapAnnDataDF(pd.DataFrame):
-    """
-    The class to expand the pandas DataFrame behaviour to support partial
-    reading and writing of AnnData obs and var (raw.var) fields.
-    The main feature of the class is handling <column-order> attribute
-    which must be a copy of h5py.Group attribute
-    """
-
-    _metadata = ["column_order"]
-
-    def rename_column(self, old_name: str, new_name: str) -> None:
-        i = np.where(self.column_order == old_name)[0]
-        self.column_order[i] = new_name
-        self.rename(columns={old_name: new_name}, inplace=True)
-
-    def remove_column(self, col_name: str) -> None:
-        i = np.where(self.column_order == col_name)[0]
-        self.column_order = np.delete(self.column_order, i)
-        self.drop(columns=[col_name], inplace=True)
-
-    def __setitem__(self, key, value) -> None:
-        if key not in self.column_order:
-            self.column_order = np.append(self.column_order, key)
-        return super().__setitem__(key, value)
-
-    @classmethod
-    def from_df(cls, df: pd.DataFrame, column_order: List[str] = None) -> Self:
-        if column_order is None:
-            column_order = df.columns.to_numpy()
-
-        new_inst = cls(df)
-        new_inst.column_order = column_order
-        return new_inst
-
-    def join(self, other: Any, **kwargs) -> Self:
-        result = super().join(other=other, **kwargs)
-        if isinstance(other, CapAnnDataDF):
-            new_columns = [
-                col for col in other.column_order if col not in self.column_order
-            ]
-        else:
-            new_columns = [col for col in other.columns if col not in self.column_order]
-        column_order = np.append(self.column_order, new_columns)
-        return self.from_df(result, column_order=column_order)
-
-    def merge(self, right, **kwargs) -> Self:
-        result = super().merge(right=right, **kwargs)
-        if isinstance(right, CapAnnDataDF):
-            new_columns = [
-                col for col in right.column_order if col not in self.column_order
-            ]
-        else:
-            new_columns = [col for col in right.columns if col not in self.column_order]
-        column_order = np.append(self.column_order, new_columns)
-        return self.from_df(result, column_order=column_order)
-
-    def copy(self, deep: Union[bool_t, None] = True) -> Self:
-        return self.from_df(super().copy(deep=deep), column_order=self.column_order)
```
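Set against the new implementation earlier in this diff, the removal highlights one behavioural fix visible in the code itself (an observation from the diff, not a stated changelog entry): the 0.3.1 `rename_column` mutated the shared `column_order` numpy array in place, while 0.5.0 edits a copy and reassigns it, so other frames sharing the array are unaffected. A small sketch of the 0.5.0 behaviour:

```python
import pandas as pd
from cap_anndata import CapAnnDataDF

df = CapAnnDataDF.from_df(pd.DataFrame({"A": [1], "B": [2]}))
backup = df.copy()  # copy() shares the current order array

df.rename_column("A", "A2")

# 0.5.0 copies the order array before editing, so the rename on the
# original leaves the backup's column_order untouched.
print(df.column_order_array())      # ['A2' 'B']
print(backup.column_order_array())  # ['A' 'B']
```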