cap-anndata 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- cap_anndata/__init__.py +4 -0
- cap_anndata/backed_df.py +32 -3
- cap_anndata/cap_anndata.py +177 -114
- cap_anndata/reader.py +44 -0
- {cap_anndata-0.1.1.dist-info → cap_anndata-0.2.1.dist-info}/METADATA +84 -30
- cap_anndata-0.2.1.dist-info/RECORD +10 -0
- cap_anndata-0.1.1.dist-info/RECORD +0 -9
- {cap_anndata-0.1.1.dist-info → cap_anndata-0.2.1.dist-info}/LICENSE +0 -0
- {cap_anndata-0.1.1.dist-info → cap_anndata-0.2.1.dist-info}/WHEEL +0 -0
- {cap_anndata-0.1.1.dist-info → cap_anndata-0.2.1.dist-info}/top_level.txt +0 -0
cap_anndata/__init__.py
CHANGED
cap_anndata/backed_df.py
CHANGED
@@ -1,8 +1,11 @@
|
|
1
1
|
import pandas as pd
|
2
2
|
import numpy as np
|
3
|
-
from typing import List
|
3
|
+
from typing import List, Any, Union
|
4
4
|
import logging
|
5
5
|
|
6
|
+
from pandas._typing import Self
|
7
|
+
from pandas.core.generic import bool_t
|
8
|
+
|
6
9
|
logger = logging.getLogger(__name__)
|
7
10
|
|
8
11
|
|
@@ -13,7 +16,8 @@ class CapAnnDataDF(pd.DataFrame):
|
|
13
16
|
The main feature of the class is handling <column-order> attribute
|
14
17
|
which must be a copy of h5py.Group attribute
|
15
18
|
"""
|
16
|
-
|
19
|
+
|
20
|
+
_metadata = ["column_order"]
|
17
21
|
|
18
22
|
def rename_column(self, old_name: str, new_name: str) -> None:
|
19
23
|
i = np.where(self.column_order == old_name)[0]
|
@@ -31,10 +35,35 @@ class CapAnnDataDF(pd.DataFrame):
|
|
31
35
|
return super().__setitem__(key, value)
|
32
36
|
|
33
37
|
@classmethod
|
34
|
-
def from_df(cls, df: pd.DataFrame, column_order: List[str] = None):
|
38
|
+
def from_df(cls, df: pd.DataFrame, column_order: List[str] = None) -> Self:
|
35
39
|
if column_order is None:
|
36
40
|
column_order = df.columns.to_numpy()
|
37
41
|
|
38
42
|
new_inst = cls(df)
|
39
43
|
new_inst.column_order = column_order
|
40
44
|
return new_inst
|
45
|
+
|
46
|
+
def join(self, other: Any, **kwargs) -> Self:
|
47
|
+
result = super().join(other=other, **kwargs)
|
48
|
+
if isinstance(other, CapAnnDataDF):
|
49
|
+
new_columns = [
|
50
|
+
col for col in other.column_order if col not in self.column_order
|
51
|
+
]
|
52
|
+
else:
|
53
|
+
new_columns = [col for col in other.columns if col not in self.column_order]
|
54
|
+
column_order = np.append(self.column_order, new_columns)
|
55
|
+
return self.from_df(result, column_order=column_order)
|
56
|
+
|
57
|
+
def merge(self, right, **kwargs) -> Self:
|
58
|
+
result = super().merge(right=right, **kwargs)
|
59
|
+
if isinstance(right, CapAnnDataDF):
|
60
|
+
new_columns = [
|
61
|
+
col for col in right.column_order if col not in self.column_order
|
62
|
+
]
|
63
|
+
else:
|
64
|
+
new_columns = [col for col in right.columns if col not in self.column_order]
|
65
|
+
column_order = np.append(self.column_order, new_columns)
|
66
|
+
return self.from_df(result, column_order=column_order)
|
67
|
+
|
68
|
+
def copy(self, deep: Union[bool_t, None] = True) -> Self:
|
69
|
+
return self.from_df(super().copy(deep=deep), column_order=self.column_order)
|
cap_anndata/cap_anndata.py
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
import logging
|
2
|
-
import contextlib
|
3
2
|
import anndata as ad
|
3
|
+
import numpy as np
|
4
4
|
import h5py
|
5
5
|
from typing import List, Union, Dict, Tuple, Final
|
6
6
|
from anndata._io.specs import read_elem, write_elem
|
7
|
-
from dataclasses import dataclass
|
8
7
|
|
9
8
|
from cap_anndata import CapAnnDataDF, CapAnnDataUns
|
10
9
|
|
10
|
+
|
11
11
|
logger = logging.getLogger(__name__)
|
12
12
|
|
13
13
|
X_NOTATION = Union[h5py.Dataset, ad.experimental.CSRDataset, ad.experimental.CSCDataset]
|
@@ -16,26 +16,11 @@ OBSM_NOTATION = Dict[str, X_NOTATION]
|
|
16
16
|
NotLinkedObject: Final = "__NotLinkedObject"
|
17
17
|
|
18
18
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
@property
|
25
|
-
def shape(self) -> Tuple[int, int]:
|
26
|
-
return self.X.shape if self.X is not None else None
|
27
|
-
|
28
|
-
|
29
|
-
class CapAnnData:
|
30
|
-
def __init__(self, h5_file: h5py.File) -> None:
|
31
|
-
self._file: h5py.File = h5_file
|
32
|
-
self.obs: CapAnnDataDF = None
|
33
|
-
self.var: CapAnnDataDF = None
|
19
|
+
class BaseLayerMatrixAndDf:
|
20
|
+
def __init__(self, file: h5py.File, path_to_content: str = "/") -> None:
|
21
|
+
self._file = file
|
22
|
+
self._path_to_content = path_to_content
|
34
23
|
self._X: X_NOTATION = None
|
35
|
-
self._obsm: OBSM_NOTATION = None
|
36
|
-
self._uns: CapAnnDataUns = None
|
37
|
-
self._raw: RawLayer = None
|
38
|
-
self._shape: Tuple[int, int] = None
|
39
24
|
|
40
25
|
@property
|
41
26
|
def X(self) -> X_NOTATION:
|
@@ -43,45 +28,48 @@ class CapAnnData:
|
|
43
28
|
self._link_x()
|
44
29
|
return self._X
|
45
30
|
|
46
|
-
|
47
|
-
|
48
|
-
if
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
if self._raw is None:
|
55
|
-
self._link_raw_x()
|
56
|
-
return self._raw
|
31
|
+
def _link_x(self) -> None:
|
32
|
+
x = self._file[self._path_to_content + "X"]
|
33
|
+
if isinstance(x, h5py.Dataset):
|
34
|
+
# dense X
|
35
|
+
self._X = x
|
36
|
+
else:
|
37
|
+
# sparse dataset
|
38
|
+
self._X = ad.experimental.sparse_dataset(x)
|
57
39
|
|
58
40
|
@property
|
59
|
-
def
|
60
|
-
if self.
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
41
|
+
def shape(self) -> Tuple[int, int]:
|
42
|
+
if self.X is not None:
|
43
|
+
shape = tuple(map(int, self.X.shape))
|
44
|
+
else:
|
45
|
+
shape = None
|
46
|
+
return shape
|
47
|
+
|
48
|
+
def _lazy_df_load(self, key: str) -> CapAnnDataDF:
|
49
|
+
df = CapAnnDataDF()
|
50
|
+
attribute = self._path_to_content + key
|
51
|
+
column_order = self._read_attr(self._file[attribute], "column-order")
|
52
|
+
df.column_order = column_order
|
53
|
+
if df.column_order.dtype != object:
|
54
|
+
# empty DataFrame will have column_order as float64
|
55
|
+
# which leads to failure in overwrite method
|
56
|
+
df.column_order = df.column_order.astype(object)
|
57
|
+
return df
|
66
58
|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
59
|
+
@staticmethod
|
60
|
+
def _read_attr(obj: Union[h5py.Group, h5py.Dataset], attr_name: str) -> any:
|
61
|
+
attrs = dict(obj.attrs)
|
62
|
+
if attr_name not in attrs.keys():
|
63
|
+
raise KeyError(f"The {attr_name} doesn't exist!")
|
64
|
+
return attrs[attr_name]
|
73
65
|
|
74
|
-
|
75
|
-
|
76
|
-
|
66
|
+
def _read_df(self, key: str, columns: List[str]) -> CapAnnDataDF:
|
67
|
+
group_path = self._path_to_content + key
|
68
|
+
if group_path not in self._file.keys():
|
69
|
+
raise ValueError(f"The group {group_path} doesn't exist in the file!")
|
77
70
|
|
78
|
-
|
79
|
-
self._raw.var = self._read_df(self._file[key], columns=columns)
|
80
|
-
else:
|
81
|
-
key = "var"
|
82
|
-
self.var = self._read_df(self._file[key], columns=columns)
|
71
|
+
h5_group = self._file[group_path]
|
83
72
|
|
84
|
-
def _read_df(self, h5_group: h5py.Group, columns: List[str]) -> CapAnnDataDF:
|
85
73
|
column_order = self._read_attr(h5_group, "column-order")
|
86
74
|
|
87
75
|
if columns is None:
|
@@ -91,31 +79,143 @@ class CapAnnData:
|
|
91
79
|
cols_to_read = [c for c in columns if c in column_order]
|
92
80
|
df = CapAnnDataDF()
|
93
81
|
df.column_order = column_order
|
94
|
-
|
95
82
|
index_col = self._read_attr(h5_group, "_index")
|
96
83
|
df.index = read_elem(h5_group[index_col])
|
97
84
|
|
98
85
|
for col in cols_to_read:
|
99
86
|
df[col] = read_elem(h5_group[col])
|
87
|
+
|
100
88
|
if df.column_order.dtype != object:
|
101
89
|
# empty DataFrame will have column_order as float64
|
102
90
|
# which leads to failure in overwrite method
|
103
91
|
df.column_order = df.column_order.astype(object)
|
104
92
|
return df
|
105
93
|
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
94
|
+
def _write_elem_lzf(self, dest_key: str, elem: any) -> None:
|
95
|
+
write_elem(self._file, dest_key, elem, dataset_kwargs={"compression": "lzf"})
|
96
|
+
|
97
|
+
def _validate_cap_df(self, cap_df: CapAnnDataDF, axis: int) -> None:
|
98
|
+
if not isinstance(cap_df, CapAnnDataDF):
|
99
|
+
raise TypeError(
|
100
|
+
f"The input should be an instance of CapAnnDataDF class but {type(cap_df)} given!"
|
101
|
+
)
|
102
|
+
|
103
|
+
if axis not in [0, 1]:
|
104
|
+
raise ValueError("The axis should be either 0 or 1!")
|
105
|
+
|
106
|
+
if cap_df.shape[0] != self.shape[axis]:
|
107
|
+
items = "cells" if axis == 0 else "genes"
|
108
|
+
raise ValueError(
|
109
|
+
f"The number of rows in the input DataFrame should be equal to the number of {items} in the "
|
110
|
+
"AnnData object!"
|
111
|
+
)
|
112
|
+
|
113
|
+
|
114
|
+
class RawLayer(BaseLayerMatrixAndDf):
|
115
|
+
def __init__(self, h5_file: h5py.File):
|
116
|
+
super().__init__(h5_file, path_to_content="/raw/")
|
117
|
+
self._var: CapAnnDataDF = None
|
118
|
+
|
119
|
+
@property
|
120
|
+
def var(self) -> CapAnnDataDF:
|
121
|
+
if self._var is None:
|
122
|
+
self._var = self._lazy_df_load("var")
|
123
|
+
return self._var
|
124
|
+
|
125
|
+
@var.setter
|
126
|
+
def var(self, cap_df: CapAnnDataDF) -> None:
|
127
|
+
self._validate_cap_df(cap_df, axis=1)
|
128
|
+
self._var = cap_df
|
129
|
+
|
130
|
+
def read_var(self, columns: List[str] = None, reset: bool = False) -> None:
|
131
|
+
df = self._read_df(key="var", columns=columns)
|
132
|
+
if self.var.empty or reset:
|
133
|
+
self._var = df
|
134
|
+
else:
|
135
|
+
for col in df.columns:
|
136
|
+
self._var[col] = df[col]
|
137
|
+
|
138
|
+
|
139
|
+
class CapAnnData(BaseLayerMatrixAndDf):
|
140
|
+
def __init__(self, h5_file: h5py.File) -> None:
|
141
|
+
super().__init__(h5_file, path_to_content="/")
|
142
|
+
self._file: h5py.File = h5_file
|
143
|
+
self._obs: CapAnnDataDF = None
|
144
|
+
self._var: CapAnnDataDF = None
|
145
|
+
self._X: X_NOTATION = None
|
146
|
+
self._obsm: OBSM_NOTATION = None
|
147
|
+
self._uns: CapAnnDataUns = None
|
148
|
+
self._raw: RawLayer = None
|
149
|
+
self._shape: Tuple[int, int] = None
|
150
|
+
|
151
|
+
@property
|
152
|
+
def obs(self) -> CapAnnDataDF:
|
153
|
+
if self._obs is None:
|
154
|
+
self._obs = self._lazy_df_load("obs")
|
155
|
+
return self._obs
|
156
|
+
|
157
|
+
@obs.setter
|
158
|
+
def obs(self, cap_df: CapAnnDataDF) -> None:
|
159
|
+
self._validate_cap_df(cap_df, axis=0)
|
160
|
+
self._obs = cap_df
|
161
|
+
|
162
|
+
@property
|
163
|
+
def var(self) -> CapAnnDataDF:
|
164
|
+
if self._var is None:
|
165
|
+
self._var = self._lazy_df_load("var")
|
166
|
+
return self._var
|
167
|
+
|
168
|
+
@var.setter
|
169
|
+
def var(self, cap_df: CapAnnDataDF) -> None:
|
170
|
+
self._validate_cap_df(cap_df, axis=1)
|
171
|
+
self._var = cap_df
|
172
|
+
|
173
|
+
@property
|
174
|
+
def obsm(self) -> OBSM_NOTATION:
|
175
|
+
if self._obsm is None:
|
176
|
+
self._link_obsm()
|
177
|
+
return self._obsm
|
178
|
+
|
179
|
+
@property
|
180
|
+
def raw(self) -> RawLayer:
|
181
|
+
if self._raw is None:
|
182
|
+
if "raw" not in self._file.keys():
|
183
|
+
logger.warning("Can't read raw.var since raw layer doesn't exist!")
|
184
|
+
return
|
185
|
+
|
186
|
+
self._raw = RawLayer(self._file)
|
187
|
+
return self._raw
|
188
|
+
|
189
|
+
@property
|
190
|
+
def uns(self) -> CapAnnDataUns:
|
191
|
+
if self._uns is None:
|
192
|
+
self._uns = CapAnnDataUns(
|
193
|
+
{k: NotLinkedObject for k in self._file["uns"].keys()}
|
194
|
+
)
|
195
|
+
return self._uns
|
196
|
+
|
197
|
+
def read_obs(self, columns: List[str] = None, reset: bool = False) -> None:
|
198
|
+
df = self._read_df("obs", columns=columns)
|
199
|
+
if self.obs.empty or reset:
|
200
|
+
self._obs = df
|
201
|
+
else:
|
202
|
+
for col in df.columns:
|
203
|
+
self._obs[col] = df[col]
|
204
|
+
|
205
|
+
def read_var(self, columns: List[str] = None, reset: bool = False) -> None:
|
206
|
+
df = self._read_df("var", columns=columns)
|
207
|
+
if self.var.empty or reset:
|
208
|
+
self._var = df
|
209
|
+
else:
|
210
|
+
for col in df.columns:
|
211
|
+
self._var[col] = df[col]
|
112
212
|
|
113
213
|
def overwrite(self, fields: List[str] = None) -> None:
|
114
214
|
field_to_entity = {
|
115
215
|
"obs": self.obs,
|
116
216
|
"var": self.var,
|
117
217
|
"raw.var": self.raw.var if self.raw is not None else None,
|
118
|
-
"uns": self.uns
|
218
|
+
"uns": self.uns,
|
119
219
|
}
|
120
220
|
|
121
221
|
if fields is None:
|
@@ -124,7 +224,9 @@ class CapAnnData:
|
|
124
224
|
for f in fields:
|
125
225
|
if f not in field_to_entity.keys():
|
126
226
|
raise KeyError(
|
127
|
-
f"The field {f} is not supported! The list of
|
227
|
+
f"The field {f} is not supported! The list of supported fields are equal to supported "
|
228
|
+
f"attributes of the CapAnnData class: obs, var, raw.var and uns."
|
229
|
+
)
|
128
230
|
|
129
231
|
for key in ["obs", "var", "raw.var"]:
|
130
232
|
if key in fields:
|
@@ -132,11 +234,17 @@ class CapAnnData:
|
|
132
234
|
if entity is None:
|
133
235
|
continue
|
134
236
|
|
135
|
-
key = key.replace(".",
|
237
|
+
key = key.replace(".", "/") if key == "raw.var" else key
|
136
238
|
|
137
239
|
for col in entity.columns:
|
138
240
|
self._write_elem_lzf(f"{key}/{col}", entity[col].values)
|
139
|
-
|
241
|
+
|
242
|
+
column_order = entity.column_order
|
243
|
+
if (
|
244
|
+
column_order.size == 0
|
245
|
+
): # Refs https://github.com/cellannotation/cap-anndata/issues/6
|
246
|
+
column_order = np.array([], dtype=np.float64)
|
247
|
+
self._file[key].attrs["column-order"] = column_order
|
140
248
|
|
141
249
|
if "uns" in fields:
|
142
250
|
for key in self.uns.keys():
|
@@ -156,32 +264,6 @@ class CapAnnData:
|
|
156
264
|
sourse = self._file[f"uns/{key}"]
|
157
265
|
self.uns[key] = read_elem(sourse)
|
158
266
|
|
159
|
-
@property
|
160
|
-
def shape(self) -> tuple[int, int]:
|
161
|
-
return self.X.shape
|
162
|
-
|
163
|
-
def _link_x(self) -> None:
|
164
|
-
x = self._file["X"]
|
165
|
-
if isinstance(x, h5py.Dataset):
|
166
|
-
# dense X
|
167
|
-
self._X = x
|
168
|
-
else:
|
169
|
-
# sparse dataset
|
170
|
-
self._X = ad.experimental.sparse_dataset(x)
|
171
|
-
|
172
|
-
def _link_raw_x(self) -> None:
|
173
|
-
if "raw" in self._file.keys():
|
174
|
-
if self._raw is None:
|
175
|
-
self._raw = RawLayer()
|
176
|
-
|
177
|
-
raw_x = self._file["raw/X"]
|
178
|
-
if isinstance(raw_x, h5py.Dataset):
|
179
|
-
# dense X
|
180
|
-
self._raw.X = raw_x
|
181
|
-
else:
|
182
|
-
# sparse dataset
|
183
|
-
self._raw.X = ad.experimental.sparse_dataset(raw_x)
|
184
|
-
|
185
267
|
def _link_obsm(self) -> None:
|
186
268
|
self._obsm = {}
|
187
269
|
if "obsm" in self._file.keys():
|
@@ -198,27 +280,8 @@ class CapAnnData:
|
|
198
280
|
def obsm_keys(self) -> List[str]:
|
199
281
|
return list(self.obsm.keys())
|
200
282
|
|
201
|
-
def
|
202
|
-
|
283
|
+
def obs_keys(self) -> List[str]:
|
284
|
+
return self.obs.column_order.tolist()
|
203
285
|
|
204
|
-
|
205
|
-
|
206
|
-
def read_anndata_file(file_path, backed='r'):
|
207
|
-
"""The method to read anndata file using original AnnData package"""
|
208
|
-
logger.debug(f"Read file {file_path} in backed mode = {backed}...")
|
209
|
-
|
210
|
-
adata = None
|
211
|
-
try:
|
212
|
-
adata = ad.read_h5ad(file_path, backed=backed)
|
213
|
-
logger.debug(f"Successfully read anndata file path {file_path}")
|
214
|
-
yield adata
|
215
|
-
|
216
|
-
except Exception as error:
|
217
|
-
logger.error(f"Error during read anndata file at path: {file_path}, error = {error}!")
|
218
|
-
raise error
|
219
|
-
|
220
|
-
finally:
|
221
|
-
if adata is not None:
|
222
|
-
if adata.isbacked:
|
223
|
-
adata.file.close()
|
224
|
-
logger.debug("AnnData closed!")
|
286
|
+
def var_keys(self) -> List[str]:
|
287
|
+
return self.var.column_order.tolist()
|
cap_anndata/reader.py
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
import logging
|
2
|
+
import contextlib
|
3
|
+
import h5py
|
4
|
+
|
5
|
+
from cap_anndata import CapAnnData
|
6
|
+
|
7
|
+
|
8
|
+
logger = logging.getLogger(__name__)
|
9
|
+
|
10
|
+
|
11
|
+
@contextlib.contextmanager
|
12
|
+
def read_h5ad(file_path: str, edit: bool = False):
|
13
|
+
"""
|
14
|
+
This is the main read method for CapAnnData.
|
15
|
+
Must be used in 'with' context.
|
16
|
+
"""
|
17
|
+
mode = "r+" if edit else "r"
|
18
|
+
logger.debug(f"Read file {file_path} mode={mode} in context...")
|
19
|
+
|
20
|
+
try:
|
21
|
+
file = h5py.File(file_path, mode)
|
22
|
+
cap_adata = CapAnnData(file)
|
23
|
+
logger.debug(f"Successfully read anndata file path {file_path}")
|
24
|
+
yield cap_adata
|
25
|
+
|
26
|
+
except Exception as error:
|
27
|
+
logger.error(f"Error during read anndata file at path: {file_path}, error = {error}!")
|
28
|
+
raise error
|
29
|
+
|
30
|
+
finally:
|
31
|
+
file.close()
|
32
|
+
logger.debug("AnnData closed!")
|
33
|
+
|
34
|
+
|
35
|
+
def read_directly(file_path: str, edit: bool = False) -> CapAnnData:
|
36
|
+
"""
|
37
|
+
Must be used only in specific cases.
|
38
|
+
User is responsible to close the h5py file when the work with CapAnnData instance done.
|
39
|
+
"""
|
40
|
+
mode = "r+" if edit else "r"
|
41
|
+
logger.debug(f"Read file {file_path} mode={mode} directly...")
|
42
|
+
file = h5py.File(file_path, mode)
|
43
|
+
cap_adata = CapAnnData(file)
|
44
|
+
return cap_adata
|
@@ -1,7 +1,7 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: cap_anndata
|
3
|
-
Version: 0.
|
4
|
-
Summary: Partial read of AnnData files for low-memory operations with large datasets.
|
3
|
+
Version: 0.2.1
|
4
|
+
Summary: Partial read/write of AnnData (h5ad) files for low-memory operations with large datasets.
|
5
5
|
Home-page: https://github.com/cellannotation/cap-anndata
|
6
6
|
Author: R. Mukhin, A. Isaev
|
7
7
|
Author-email: roman@ebookapplications.com
|
@@ -15,8 +15,10 @@ License-File: LICENSE
|
|
15
15
|
Requires-Dist: numpy >=1.26.3
|
16
16
|
Requires-Dist: pandas >=2.2.0
|
17
17
|
Requires-Dist: anndata >=0.10.5
|
18
|
+
Requires-Dist: h5py >=3.5.0
|
18
19
|
Provides-Extra: dev
|
19
20
|
Requires-Dist: pytest >=8.0.0 ; extra == 'dev'
|
21
|
+
Requires-Dist: setuptools ~=69.1.1 ; extra == 'dev'
|
20
22
|
|
21
23
|
# CAP-AnnData: Enhanced Partial I/O for AnnData Files
|
22
24
|
|
@@ -25,41 +27,65 @@ CAP-AnnData enriches the AnnData ecosystem by offering tailored functionalities
|
|
25
27
|
|
26
28
|
## Getting Started
|
27
29
|
|
30
|
+
### Installation
|
31
|
+
Install CAP-AnnData via pip:
|
32
|
+
|
33
|
+
```commandline
|
34
|
+
pip install -U cap-anndata
|
35
|
+
```
|
36
|
+
|
28
37
|
### Running Tests
|
29
|
-
Ensure the integrity and reliability of CAP-AnnData on your system by running the unit tests
|
38
|
+
Ensure the integrity and reliability of CAP-AnnData on your system by running the unit tests via `pytest` from the root of the repo.
|
39
|
+
|
40
|
+
```commandline
|
41
|
+
pip install pytest
|
42
|
+
pytest test
|
43
|
+
```
|
30
44
|
|
31
45
|
Make sure Python 3.9 or newer is used, along with all requirements specified in requirements.txt
|
32
46
|
|
33
47
|
## How-TO:
|
34
48
|
|
35
|
-
#### 1.
|
49
|
+
#### 1. Access AnnData File DataFrames
|
36
50
|
|
37
51
|
##### Basic Reading
|
38
52
|
By default, `CapAnnData` does not automatically read any data. To begin working with dataframes, you need to explicitly read the data from the AnnData file. You can read the entire dataframe or select specific columns. For partial reading, provide a list of column names.
|
39
53
|
|
40
54
|
```python
|
41
|
-
import
|
42
|
-
from cap_anndata import CapAnnData
|
55
|
+
from cap_anndata import read_h5ad
|
43
56
|
|
44
57
|
file_path = "your_data.h5ad"
|
45
|
-
with
|
46
|
-
|
47
|
-
|
58
|
+
with read_h5ad(file_path=file_path, edit=False) as cap_adata:
|
59
|
+
# Get the list of all obs columns in AnnData file
|
60
|
+
cap_adata.obs_keys() # ['a', 'b', 'c']
|
48
61
|
# Read all columns of 'obs'
|
49
62
|
cap_adata.read_obs()
|
63
|
+
# Get the list of columns of DataFrame in memory
|
64
|
+
cap_adata.obs.columns # ['a', 'b', 'c']
|
50
65
|
|
66
|
+
# Get the list of all var columns in AnnData file
|
67
|
+
cap_adata.var_keys() # ['d', 'e', 'f']
|
51
68
|
# Read specific columns of 'var'
|
52
|
-
cap_adata.read_var(columns=['
|
53
|
-
|
54
|
-
# Read
|
55
|
-
cap_adata.read_var(
|
69
|
+
cap_adata.read_var(columns=['d'])
|
70
|
+
cap_adata.var.columns # ['d']
|
71
|
+
# Read additional column
|
72
|
+
cap_adata.read_var(columns=['e'])
|
73
|
+
cap_adata.var.columns # ['d', 'e']
|
74
|
+
|
75
|
+
# Read column and reset the in-memory DataFrame before that
|
76
|
+
cap_adata.read_var(columns=['f'], reset=True)
|
77
|
+
cap_adata.var.columns # ['f']
|
78
|
+
|
79
|
+
# Read no columns of raw.var (only the index)
|
80
|
+
cap_adata.raw.read_var(columns=[])
|
56
81
|
```
|
57
82
|
|
58
|
-
#####
|
83
|
+
##### Difference between `obs_keys()` and `obs.columns`
|
84
|
+
`obs_keys()` returns the list of columns in the on-disk AnnData file, while `obs.columns` returns the list of columns in the in-memory DataFrame. The two lists may differ if you read only specific columns. If you modify the in-memory DataFrame, `obs_keys()` will reflect the changes. It is recommended to check `obs_keys()` before calling `overwrite()` to avoid damaging the AnnData file.
|
59
85
|
|
60
|
-
If a column doesn't exist in the file, no error will be raised but the column will be missing in the resulting
|
86
|
+
If a column doesn't exist in the file, no error will be raised, but the column will be missing from the resulting DataFrame. In other words, the list of columns means "try to read these columns from the file". This is needed because there is not yet a way to check whether a column exists before the read. The same behavior applies to `var_keys()` and `var.columns`.
|
61
87
|
|
62
|
-
#### 2. Modify the AnnData File
|
88
|
+
#### 2. Modify the AnnData File DataFrames In-Place
|
63
89
|
|
64
90
|
You can directly modify the dataframe by adding, renaming, or removing columns.
|
65
91
|
|
@@ -68,13 +94,14 @@ You can directly modify the dataframe by adding, renaming, or removing columns.
|
|
68
94
|
cap_adata.obs['new_col'] = [value1, value2, value3]
|
69
95
|
|
70
96
|
# Rename a column
|
71
|
-
cap_adata.rename_column('old_col_name', 'new_col_name')
|
97
|
+
cap_adata.obs.rename_column('old_col_name', 'new_col_name')
|
72
98
|
|
73
99
|
# Remove a column
|
74
|
-
cap_adata.remove_column('col_to_remove')
|
100
|
+
cap_adata.obs.remove_column('col_to_remove')
|
75
101
|
```
|
76
102
|
|
77
103
|
After modifications, you can overwrite the changes back to the AnnData file. If a value doesn't exist, it will be created.
|
104
|
+
Note: `read_h5ad` must be called with the `edit=True` argument to open the `.h5ad` file in `r+` mode.
|
78
105
|
|
79
106
|
```python
|
80
107
|
# overwrite all values which were read
|
@@ -84,7 +111,7 @@ cap_adata.overwrite()
|
|
84
111
|
cap_adata.overwrite(['obs', 'var'])
|
85
112
|
```
|
86
113
|
|
87
|
-
The full list of supported fields: `
|
114
|
+
The full list of supported fields: `obs`, `var`, `raw.var`, `uns`.
|
88
115
|
|
89
116
|
#### 3. How to Read Few Columns but Overwrite One in a Dataframe
|
90
117
|
|
@@ -100,14 +127,19 @@ cap_adata.obs.drop(columns='sample', inplace=True)
|
|
100
127
|
|
101
128
|
# Overwrite changes
|
102
129
|
cap_adata.overwrite(['obs'])
|
130
|
+
|
131
|
+
# NOTE that the line
|
132
|
+
# cap_adata.read_obs(columns=['sample'], reset=True)
|
133
|
+
# Will override in-memory changes with values from the AnnData file
|
103
134
|
```
|
104
135
|
|
105
136
|
#### 4. How to work with X and raw.X
|
106
137
|
|
107
|
-
The CapAnnData package won't read any field by default. However, the `X` and `raw.X` will be linked to the backed matrices automatically upon the first request to those fields.
|
138
|
+
The CapAnnData package won't read any field by default. However, the `X` and `raw.X` will be linked to the backed matrices automatically upon the first request to those fields.
|
139
|
+
The X object will be returned as the `h5py.Dataset` or `AnnData.experimental.sparse_dataset`.
|
108
140
|
|
109
141
|
```python
|
110
|
-
with
|
142
|
+
with read_h5ad(file_path=file_path, edit=False) as cap_adata:
|
111
143
|
# self.X is None here
|
112
144
|
cap_adata = CapAnnData(file)
|
113
145
|
|
@@ -135,13 +167,13 @@ s_ = np.s_[mask, :5]
|
|
135
167
|
|
136
168
|
#### 5. How to handle obsm embeddings matrices
|
137
169
|
|
138
|
-
By the default the CapAnnData will not read the embeddings matrix.
|
170
|
+
By default, CapAnnData will not read the embeddings matrix.
|
171
|
+
The link to the h5py objects will be created upon the first call of the `.obsm` property.
|
172
|
+
Alike the AnnData package the call like `cap_adata.obsm["X_tsne"]` will not return the in-memory matrix but will return the backed version instead.
|
173
|
+
It is possible to get information about the name and shape of the embeddings without loading the whole matrix into memory.
|
139
174
|
|
140
175
|
```python
|
141
|
-
with
|
142
|
-
# initialization
|
143
|
-
cap_adata = CapAnnData(file)
|
144
|
-
|
176
|
+
with read_h5ad(file_path=file_path, edit=False) as cap_adata:
|
145
177
|
# will return the list of strings
|
146
178
|
obsm_keys = cap_adata.obsm_keys()
|
147
179
|
|
@@ -158,10 +190,7 @@ with h5py.File(path) as file:
|
|
158
190
|
The `CapAnnData` class will lazily link the uns section upon the first call but ***WILL NOT*** read it into memory. Instead, a dictionary of the pairs `{'key': "__NotLinkedObject"}` will be created. This allows getting the list of keys before the actual read. To read the uns section into memory, the `.read_uns(keys)` method must be called.
|
159
191
|
|
160
192
|
```python
|
161
|
-
with
|
162
|
-
# initialization
|
163
|
-
cap_adata = CapAnnData(file)
|
164
|
-
|
193
|
+
with read_h5ad(file_path=file_path, edit=True) as cap_adata:
|
165
194
|
# will return the keys() object
|
166
195
|
keys = cap_adata.uns.keys()
|
167
196
|
|
@@ -197,3 +226,28 @@ To save `uns` changes the method `CapAnnData.overwrite()` must be called.
|
|
197
226
|
cap_adata.overwrite() # all in-memory fields will be overwritten
|
198
227
|
cap_adata.overwrite(["uns"]) # overwrite the uns section only
|
199
228
|
```
|
229
|
+
|
230
|
+
#### 7. Join and Merge DataFrames
|
231
|
+
|
232
|
+
Cap-AnnData provides enhanced methods for joining and merging dataframes, preserving column order and data integrity.
|
233
|
+
|
234
|
+
```python
|
235
|
+
from cap_anndata import CapAnnDataDF
|
236
|
+
import pandas as pd
|
237
|
+
|
238
|
+
data1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
|
239
|
+
data2 = pd.DataFrame({'D': [7, 8, 9], 'E': [10, 11, 12]})
|
240
|
+
cap_anndata_df1 = CapAnnDataDF.from_df(data1, column_order=['A', 'B', 'C'])
|
241
|
+
|
242
|
+
cap_df = cap_anndata_df1.join(data2, how='left')
|
243
|
+
|
244
|
+
cap_df.columns # ['A', 'B', 'D', 'E']
|
245
|
+
cap_df.column_order # ['A', 'B', 'C', 'D', 'E']
|
246
|
+
|
247
|
+
data3 = pd.DataFrame({'A': [2, 3, 4], 'D': [10, 11, 12]})
|
248
|
+
cap_df = cap_anndata_df1.merge(data3, on='A')
|
249
|
+
|
250
|
+
cap_df.columns # ['A', 'B', 'D']
|
251
|
+
cap_df.column_order # ['A', 'B', 'C', 'D']
|
252
|
+
cap_df.shape # (2, 3)
|
253
|
+
```
|
@@ -0,0 +1,10 @@
|
|
1
|
+
cap_anndata/__init__.py,sha256=l9lvFpcMsQksp8_dI-fjUgrImoMdztbu3jVSdmxNPmA,205
|
2
|
+
cap_anndata/backed_df.py,sha256=06wZwEjszFQ8lkvy6-GgD_SD14idu9857RtlfMQiBjE,2691
|
3
|
+
cap_anndata/backed_uns.py,sha256=Tfxoz3RgcgENf4SvxFOox9w048K2QmBTh1VbAf4yqVI,854
|
4
|
+
cap_anndata/cap_anndata.py,sha256=nv5f7A9jyK_rZ2kx54XvnX-V65MFlE3CYQC-n_zBhB8,10097
|
5
|
+
cap_anndata/reader.py,sha256=kg9xoS_S0gY6WpsHE8PwGMa14VXh9Ibqjw4bwoerYsE,1267
|
6
|
+
cap_anndata-0.2.1.dist-info/LICENSE,sha256=JAV0w7TBl6wQe9iFcCKjAWgpurym0f-Q0B75zm2PrKw,1560
|
7
|
+
cap_anndata-0.2.1.dist-info/METADATA,sha256=KuFmqvbkQ4O61na6ifNlekuL6t2NU2NxrSV3npXUfMg,9569
|
8
|
+
cap_anndata-0.2.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
9
|
+
cap_anndata-0.2.1.dist-info/top_level.txt,sha256=GKi_Uk4LUhXwWBfFCTIyJvEoJqFREt_4uH4CWgeLsg4,12
|
10
|
+
cap_anndata-0.2.1.dist-info/RECORD,,
|
@@ -1,9 +0,0 @@
|
|
1
|
-
cap_anndata/__init__.py,sha256=4Vex9i79uTgNQZo_yiEuNc0KoLXPs5Awv87KWmDbxzM,143
|
2
|
-
cap_anndata/backed_df.py,sha256=Ce74WHzXhebYRORx7yjVJD02XCcF5j1SxvmbTIpwzCA,1418
|
3
|
-
cap_anndata/backed_uns.py,sha256=Tfxoz3RgcgENf4SvxFOox9w048K2QmBTh1VbAf4yqVI,854
|
4
|
-
cap_anndata/cap_anndata.py,sha256=4mpsJjEgrmJGSVaY0cYp_mM6CTquI9NC-oYXbZxUjH4,7815
|
5
|
-
cap_anndata-0.1.1.dist-info/LICENSE,sha256=JAV0w7TBl6wQe9iFcCKjAWgpurym0f-Q0B75zm2PrKw,1560
|
6
|
-
cap_anndata-0.1.1.dist-info/METADATA,sha256=7xIKTN7cO4jbenuUTXEXJcJUCFJZwwZPN7VvE5Fp7EE,7123
|
7
|
-
cap_anndata-0.1.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
8
|
-
cap_anndata-0.1.1.dist-info/top_level.txt,sha256=GKi_Uk4LUhXwWBfFCTIyJvEoJqFREt_4uH4CWgeLsg4,12
|
9
|
-
cap_anndata-0.1.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|