cap-anndata 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- cap_anndata/__init__.py +10 -10
- cap_anndata/backed_df.py +69 -69
- cap_anndata/backed_dict.py +34 -34
- cap_anndata/cap_anndata.py +600 -600
- cap_anndata/reader.py +57 -57
- {cap_anndata-0.3.0.dist-info → cap_anndata-0.3.1.dist-info}/LICENSE +28 -28
- {cap_anndata-0.3.0.dist-info → cap_anndata-0.3.1.dist-info}/METADATA +56 -54
- cap_anndata-0.3.1.dist-info/RECORD +10 -0
- {cap_anndata-0.3.0.dist-info → cap_anndata-0.3.1.dist-info}/WHEEL +1 -1
- cap_anndata-0.3.0.dist-info/RECORD +0 -10
- {cap_anndata-0.3.0.dist-info → cap_anndata-0.3.1.dist-info}/top_level.txt +0 -0
cap_anndata/cap_anndata.py
CHANGED
@@ -1,600 +1,600 @@
|
|
1
|
-
import logging
|
2
|
-
import anndata as ad
|
3
|
-
import numpy as np
|
4
|
-
import h5py
|
5
|
-
from typing import List, Union, Any, Tuple, Final
|
6
|
-
import scipy.sparse as ss
|
7
|
-
from packaging import version
|
8
|
-
|
9
|
-
if version.parse(ad.__version__) < version.parse("0.11.0"):
|
10
|
-
from anndata.experimental import sparse_dataset, read_elem, write_elem
|
11
|
-
else:
|
12
|
-
from anndata import sparse_dataset, read_elem, write_elem
|
13
|
-
|
14
|
-
from cap_anndata import CapAnnDataDF, CapAnnDataDict
|
15
|
-
|
16
|
-
# Module-level logger; handlers and levels are left to the host application.
logger = logging.getLogger(__name__)

# Type of a matrix that stays on disk: a dense HDF5 dataset, an anndata
# CSR/CSC sparse-dataset wrapper, or None when nothing is linked yet.
# NOTE(review): newer anndata releases may expose CSRDataset/CSCDataset from a
# module other than ``anndata.experimental`` -- confirm against the installed
# version before relying on these attribute paths.
X_NOTATION = Union[
    h5py.Dataset, ad.experimental.CSRDataset, ad.experimental.CSCDataset, None
]
# Mapping from an array name to its backed matrix.
ARRAY_MAPPING_NOTATION = CapAnnDataDict[str, X_NOTATION]

# Placeholder value stored for ``uns`` keys that exist in the file but whose
# content has not been read into memory.
NotLinkedObject: Final = "__NotLinkedObject"
|
24
|
-
|
25
|
-
|
26
|
-
class BaseLayerMatrixAndDf:
    """Shared machinery for objects backed by one group of an AnnData HDF5 file.

    The class gives lazy access to the ``X`` matrix stored under
    ``path_to_content`` and bundles the helpers that subclasses use to read
    DataFrame-like groups, link array mappings and create new matrices.
    """

    def __init__(self, file: h5py.File, path_to_content: str = "/") -> None:
        """Remember the open file and the path prefix of this object's content.

        Args:
            file: open HDF5 file holding the data.
            path_to_content: prefix that is concatenated with keys such as
                ``"X"`` before they are looked up in ``file``, so it is
                expected to end with ``"/"``.
        """
        self._file = file
        self._path_to_content = path_to_content
        self._X: X_NOTATION = None

    @property
    def file(self) -> h5py.File:
        """The underlying HDF5 file object."""
        return self._file

    @property
    def X(self) -> X_NOTATION:
        """The backed ``X`` matrix, linked on first access."""
        if self._X is None:
            self._link_x()
        return self._X

    def _link_x(self) -> None:
        """Bind ``self._X`` to the ``X`` entry found under the content path.

        An ``h5py.Dataset`` is treated as a dense matrix and used as is; any
        other object is wrapped with ``sparse_dataset``.
        """
        x = self._file[self._path_to_content + "X"]
        self._X = x if isinstance(x, h5py.Dataset) else sparse_dataset(x)

    @property
    def shape(self) -> Union[Tuple[int, int], None]:
        """Shape of ``X`` as a tuple of plain ``int`` values, or ``None`` if ``X`` is ``None``."""
        x = self.X
        if x is None:
            return None
        return tuple(map(int, x.shape))

    @staticmethod
    def _ensure_object_column_order(df: CapAnnDataDF) -> CapAnnDataDF:
        """Cast ``df.column_order`` to ``object`` dtype when it has another dtype.

        Returns the same ``df`` so the call can be used in a return statement.
        """
        if df.column_order.dtype != object:
            # empty DataFrame will have column_order as float64
            # which leads to failure in overwrite method
            df.column_order = df.column_order.astype(object)
        return df

    def _lazy_df_load(self, key: str) -> CapAnnDataDF:
        """Create a ``CapAnnDataDF`` that carries only the stored column order.

        No column data is read; only the ``column-order`` attribute of the
        group ``path_to_content + key`` is consulted.
        """
        df = CapAnnDataDF()
        attribute = self._path_to_content + key
        df.column_order = self._read_attr(self._file[attribute], "column-order")
        return self._ensure_object_column_order(df)

    @staticmethod
    def _read_attr(obj: Union[h5py.Group, h5py.Dataset], attr_name: str) -> Any:
        """Return the attribute ``attr_name`` of ``obj``.

        Only the requested attribute is read; the attribute collection is not
        copied.

        Raises:
            KeyError: if ``obj`` has no attribute called ``attr_name``.
        """
        attrs = obj.attrs
        if attr_name not in attrs:
            raise KeyError(f"The {attr_name} doesn't exist!")
        return attrs[attr_name]

    def _read_df(self, key: str, columns: Union[List[str], str, None]) -> CapAnnDataDF:
        """Read the DataFrame-like group ``key`` from the file.

        Args:
            key: group name relative to the content path (for example ``"obs"``).
            columns: columns to read. ``None`` reads the whole group, a single
                string is treated as a one-element list, and names that are
                not present in the stored column order are skipped.

        Raises:
            ValueError: if the group does not exist in the file.
            KeyError: if a required attribute is missing on the group.
        """
        group_path = self._path_to_content + key
        if group_path not in self._file.keys():
            raise ValueError(f"The group {group_path} doesn't exist in the file!")

        h5_group = self._file[group_path]
        column_order = self._read_attr(h5_group, "column-order")

        if columns is None:
            # read whole df
            df = CapAnnDataDF.from_df(read_elem(h5_group), column_order=column_order)
        else:
            if isinstance(columns, str):
                # single column provided instead of list
                columns = [columns]
            cols_to_read = [c for c in columns if c in column_order]
            df = CapAnnDataDF()
            df.column_order = column_order
            # The "_index" attribute names the dataset that stores the index.
            index_col = self._read_attr(h5_group, "_index")
            df.index = read_elem(h5_group[index_col])

            for col in cols_to_read:
                df[col] = read_elem(h5_group[col])

        return self._ensure_object_column_order(df)

    def _write_elem(self, dest_key: str, elem: Any, compression: str) -> None:
        """Write ``elem`` to ``dest_key`` with ``write_elem`` using the given compression."""
        write_elem(
            self._file, dest_key, elem, dataset_kwargs={"compression": compression}
        )

    def _validate_cap_df(self, cap_df: CapAnnDataDF, axis: int) -> None:
        """Check that ``cap_df`` can be attached along ``axis`` of ``X``.

        Raises:
            TypeError: if ``cap_df`` is not a ``CapAnnDataDF``.
            ValueError: if ``axis`` is not 0 or 1, or if the number of rows of
                ``cap_df`` differs from ``self.shape[axis]``.
        """
        if not isinstance(cap_df, CapAnnDataDF):
            raise TypeError(
                f"The input should be an instance of CapAnnDataDF class but {type(cap_df)} given!"
            )

        if axis not in [0, 1]:
            raise ValueError("The axis should be either 0 or 1!")

        if cap_df.shape[0] != self.shape[axis]:
            items = "cells" if axis == 0 else "genes"
            raise ValueError(
                f"The number of rows in the input DataFrame should be equal to the number of {items} in the "
                "AnnData object!"
            )

    def _link_array_mapping(self, cap_dict: CapAnnDataDict, key: str) -> None:
        """Method to update given cap_dict with backed array entities from the file.

        Datasets inside the group ``key`` are stored as is, sub-groups are
        wrapped with ``sparse_dataset``.

        Raises:
            KeyError: if ``key`` is not present in the file.
            ValueError: if ``key`` is not a group or holds an unsupported object.
        """
        if key not in self._file.keys():
            raise KeyError(f"The key {key} doesn't exist in the file! Ignore linking.")

        group = self._file[key]
        if not isinstance(group, h5py.Group):
            raise ValueError(f"The object {key} must be a group!")

        for array_name in group.keys():
            array = group[array_name]
            if isinstance(array, h5py.Dataset):
                cap_dict[array_name] = array
            elif isinstance(array, h5py.Group):
                cap_dict[array_name] = sparse_dataset(array)
            else:
                raise ValueError(
                    f"Can't link array in {key} due to unsupported type of object: {type(array)}"
                )

    def _create_new_matrix(
        self,
        dest: str,
        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
        matrix_shape: Union[tuple[int, int], None] = None,
        data_dtype: Union[np.dtype, None] = None,
        format: Union[str, None] = None,  # TODO: use Enum instead of str
        compression: str = "lzf",
    ) -> None:
        """Create a matrix at ``dest`` in the file.

        Args:
            dest: destination path inside the file.
            matrix: data to write. When given, it is written as is and the
                remaining shape/dtype/format arguments are not used.
            matrix_shape: shape of the empty matrix to create when ``matrix``
                is ``None``; the sparse formats fall back to ``(0, 0)``.
            data_dtype: dtype of the empty matrix; the sparse formats fall
                back to ``np.float64``.
            format: ``"dense"``, ``"csr"`` or ``"csc"``; only consulted when
                ``matrix`` is ``None``.
            compression: compression passed on to the writer.

        Raises:
            NotImplementedError: if ``matrix`` is ``None`` and ``format`` is
                not one of the supported values.
        """
        if matrix is not None:
            self._write_elem(dest, matrix, compression=compression)
            return

        if format == "dense":
            group = self._file.create_dataset(
                name=dest,
                shape=matrix_shape,
                dtype=data_dtype,
                compression=compression,
            )
            # https://anndata.readthedocs.io/en/latest/fileformat-prose.html#dense-arrays-specification-v0-2-0
            group.attrs["encoding-type"] = "array"
            group.attrs["encoding-version"] = "0.2.0"
        elif format in ["csr", "csc"]:
            # Based on https://github.com/appier/h5sparse/blob/master/h5sparse/h5sparse.py
            if data_dtype is None:
                data_dtype = np.float64
            if matrix_shape is None:
                matrix_shape = (0, 0)
            sparse_class = ss.csr_matrix if format == "csr" else ss.csc_matrix
            data = sparse_class(matrix_shape, dtype=data_dtype)
            self._write_elem(dest, data, compression=compression)
        else:
            raise NotImplementedError(
                f"Format must be 'dense', 'csr' or 'csc' but {format} given!"
            )
|
186
|
-
|
187
|
-
|
188
|
-
class RawLayer(BaseLayerMatrixAndDf):
    """Backed view of the ``/raw/`` group: its ``X`` matrix and its ``var`` table."""

    def __init__(self, h5_file: h5py.File):
        """Bind the layer to ``h5_file`` with ``"/raw/"`` as the content path."""
        super().__init__(h5_file, path_to_content="/raw/")
        # Filled on first access of ``var`` or by ``read_var``.
        self._var: Union[CapAnnDataDF, None] = None

    @property
    def var(self) -> CapAnnDataDF:
        """The ``var`` table; on first access only the column order is loaded."""
        if self._var is None:
            self._var = self._lazy_df_load("var")
        return self._var

    @var.setter
    def var(self, cap_df: CapAnnDataDF) -> None:
        """Replace ``var`` after validating ``cap_df`` against axis 1 of ``X``."""
        self._validate_cap_df(cap_df, axis=1)
        self._var = cap_df

    def read_var(
        self, columns: Union[List[str], str, None] = None, reset: bool = False
    ) -> None:
        """Read ``var`` columns from the file into memory.

        Args:
            columns: columns to read; ``None`` reads the whole table.
            reset: when true, or when the current ``var`` is empty, the
                in-memory table is replaced by what was read. Otherwise the
                read columns are assigned into the existing table one by one.
        """
        df = self._read_df(key="var", columns=columns)
        if self.var.empty or reset:
            self._var = df
            return
        for col in df.columns:
            self._var[col] = df[col]
|
211
|
-
|
212
|
-
|
213
|
-
class CapAnnData(BaseLayerMatrixAndDf):
    """Partially loaded view of an AnnData HDF5 file.

    ``obs``/``var`` are loaded lazily as ``CapAnnDataDF`` objects, ``uns`` as a
    ``CapAnnDataDict`` of placeholders, and the array mappings (``layers``,
    ``obsm``, ``varm``, ``obsp``, ``varp``) as dictionaries of backed matrices.
    The object can be used as a context manager; leaving the context closes
    the file.
    """

    # Every field name accepted by ``overwrite``.
    _OVERWRITE_FIELDS = (
        "obs",
        "var",
        "raw.var",
        "uns",
        "layers",
        "obsm",
        "varm",
        "obsp",
        "varp",
    )

    def __init__(self, h5_file: h5py.File) -> None:
        """Bind the object to ``h5_file``; nothing is read from the file here."""
        # The base initialiser stores the file and resets ``_X``.
        super().__init__(h5_file, path_to_content="/")
        self._obs: Union[CapAnnDataDF, None] = None
        self._var: Union[CapAnnDataDF, None] = None
        self._obsm: Union[CapAnnDataDict, None] = None
        self._varm: Union[CapAnnDataDict, None] = None
        self._layers: Union[CapAnnDataDict, None] = None
        self._uns: Union[CapAnnDataDict, None] = None
        self._obsp: Union[CapAnnDataDict, None] = None
        self._varp: Union[CapAnnDataDict, None] = None
        self._raw: Union[RawLayer, None] = None
        # Kept for compatibility; not read by any method of this class.
        self._shape: Union[Tuple[int, int], None] = None

    @property
    def obs(self) -> CapAnnDataDF:
        """The ``obs`` table; on first access only the column order is loaded."""
        if self._obs is None:
            self._obs = self._lazy_df_load("obs")
        return self._obs

    @obs.setter
    def obs(self, cap_df: CapAnnDataDF) -> None:
        """Replace ``obs`` after validating ``cap_df`` against axis 0 of ``X``."""
        self._validate_cap_df(cap_df, axis=0)
        self._obs = cap_df

    @property
    def var(self) -> CapAnnDataDF:
        """The ``var`` table; on first access only the column order is loaded."""
        if self._var is None:
            self._var = self._lazy_df_load("var")
        return self._var

    @var.setter
    def var(self, cap_df: CapAnnDataDF) -> None:
        """Replace ``var`` after validating ``cap_df`` against axis 1 of ``X``."""
        self._validate_cap_df(cap_df, axis=1)
        self._var = cap_df

    @property
    def raw(self) -> Union[RawLayer, None]:
        """The raw layer, or ``None`` (with a logged warning) if it is missing or empty."""
        if self._raw is None:
            if "raw" not in self._file.keys():
                logger.warning("Can't read raw.var since raw layer doesn't exist!")
                return None

            if len(self._file["raw"].keys()) == 0:
                logger.warning("The raw layer is empty!")
                return None

            self._raw = RawLayer(self._file)
        return self._raw

    @property
    def uns(self) -> CapAnnDataDict[str, Any]:
        """Mapping of ``uns`` keys; values are ``NotLinkedObject`` until read.

        A file without an ``uns`` group yields an empty mapping, in line with
        how the array mappings treat a missing group.
        """
        if self._uns is None:
            keys = self._file["uns"].keys() if "uns" in self._file else ()
            self._uns = CapAnnDataDict({k: NotLinkedObject for k in keys})
        return self._uns

    @property
    def layers(self) -> CapAnnDataDict[str, X_NOTATION]:
        """Backed matrices of the ``layers`` group, linked on first access."""
        if self._layers is None:
            self._link_layers()
        return self._layers

    @property
    def obsm(self) -> CapAnnDataDict[str, X_NOTATION]:
        """Backed matrices of the ``obsm`` group, linked on first access."""
        if self._obsm is None:
            self._link_obsm()
        return self._obsm

    @property
    def varm(self) -> CapAnnDataDict[str, X_NOTATION]:
        """Backed matrices of the ``varm`` group, linked on first access."""
        if self._varm is None:
            self._link_varm()
        return self._varm

    @property
    def obsp(self) -> CapAnnDataDict[str, X_NOTATION]:
        """Backed matrices of the ``obsp`` group, linked on first access."""
        if self._obsp is None:
            self._link_obsp()
        return self._obsp

    @property
    def varp(self) -> CapAnnDataDict[str, X_NOTATION]:
        """Backed matrices of the ``varp`` group, linked on first access."""
        if self._varp is None:
            self._link_varp()
        return self._varp

    def read_obs(
        self, columns: Union[List[str], str, None] = None, reset: bool = False
    ) -> None:
        """Read ``obs`` columns from the file into memory.

        Args:
            columns: columns to read; ``None`` reads the whole table.
            reset: when true, or when the current ``obs`` is empty, the
                in-memory table is replaced by what was read. Otherwise the
                read columns are assigned into the existing table one by one.
        """
        df = self._read_df("obs", columns=columns)
        if self.obs.empty or reset:
            self._obs = df
            return
        for col in df.columns:
            self._obs[col] = df[col]

    def read_var(
        self, columns: Union[List[str], str, None] = None, reset: bool = False
    ) -> None:
        """Read ``var`` columns from the file into memory.

        Args:
            columns: columns to read; ``None`` reads the whole table.
            reset: when true, or when the current ``var`` is empty, the
                in-memory table is replaced by what was read. Otherwise the
                read columns are assigned into the existing table one by one.
        """
        df = self._read_df("var", columns=columns)
        if self.var.empty or reset:
            self._var = df
            return
        for col in df.columns:
            self._var[col] = df[col]

    def read_uns(self, keys: Union[List[str], str, None] = None) -> None:
        """Load the values of the given ``uns`` keys into memory.

        Args:
            keys: keys to read; ``None`` reads every known key and a single
                string is treated as a one-element list. Keys that are not
                present in ``uns`` are skipped.
        """
        if keys is None:
            keys = list(self.uns.keys())
        elif isinstance(keys, str):
            # Avoid iterating over the characters of a single key.
            keys = [keys]

        for key in keys:
            if key in self.uns.keys():
                self.uns[key] = read_elem(self._file[f"uns/{key}"])

    def _link_field(self, field: str) -> None:
        """Create the mapping stored in ``_<field>`` if needed and link the file group into it."""
        attr_name = f"_{field}"
        if getattr(self, attr_name) is None:
            setattr(self, attr_name, CapAnnDataDict())
        if field in self._file.keys():
            self._link_array_mapping(cap_dict=getattr(self, attr_name), key=field)

    def _link_layers(self) -> None:
        """Link the ``layers`` group into ``self._layers``."""
        self._link_field("layers")

    def _link_obsm(self) -> None:
        """Link the ``obsm`` group into ``self._obsm``."""
        self._link_field("obsm")

    def _link_varm(self) -> None:
        """Link the ``varm`` group into ``self._varm``."""
        self._link_field("varm")

    def _link_obsp(self):
        """Link the ``obsp`` group into ``self._obsp``."""
        self._link_field("obsp")

    def _link_varp(self):
        """Link the ``varp`` group into ``self._varp``."""
        self._link_field("varp")

    def obsm_keys(self) -> List[str]:
        """Names of the linked ``obsm`` arrays."""
        return list(self.obsm.keys())

    def obs_keys(self) -> List[str]:
        """Column order of ``obs`` as a list."""
        return self.obs.column_order.tolist()

    def var_keys(self) -> List[str]:
        """Column order of ``var`` as a list."""
        return self.var.column_order.tolist()

    def overwrite(
        self, fields: Union[List[str], str, None] = None, compression: str = "lzf"
    ) -> None:
        """Write in-memory changes of the selected fields back to the file.

        For ``obs``, ``var`` and ``raw.var`` every in-memory column and the
        column order are written. For ``uns`` every loaded value is written
        and keys listed in ``keys_to_remove`` are deleted. For the array
        mappings only the keys listed in ``keys_to_remove`` are deleted.
        Only the requested fields are accessed.

        Args:
            fields: fields to write; ``None`` means all supported fields and a
                single string is treated as a one-element list.
            compression: compression passed on to the writer.

        Raises:
            KeyError: if a requested field is not supported.
        """
        supported = self._OVERWRITE_FIELDS
        if fields is None:
            fields = list(supported)
        else:
            if isinstance(fields, str):
                # Avoid iterating over the characters of a single field name.
                fields = [fields]
            for f in fields:
                if f not in supported:
                    raise KeyError(
                        f"The field {f} is not supported! The list of supported fields are equal to supported "
                        f"attributes of the CapAnnData class: {', '.join(supported)}."
                    )

        for field in ("obs", "var", "raw.var"):
            if field not in fields:
                continue

            if field == "raw.var":
                raw = self.raw
                entity = raw.var if raw is not None else None
            else:
                entity = getattr(self, field)
            if entity is None:
                continue

            # "raw.var" lives under the "raw/var" group of the file.
            dest = field.replace(".", "/")

            for col in entity.columns:
                self._write_elem(
                    f"{dest}/{col}", entity[col].values, compression=compression
                )

            column_order = entity.column_order
            if column_order.size == 0:
                # Refs https://github.com/cellannotation/cap-anndata/issues/6
                column_order = np.array([], dtype=np.float64)
            self._file[dest].attrs["column-order"] = column_order

        if "uns" in fields:
            uns = self.uns
            for key in uns.keys():
                if uns[key] is not NotLinkedObject:
                    self._write_elem(f"uns/{key}", uns[key], compression=compression)
            self._delete_paths("uns", uns.keys_to_remove)

        for field in ("layers", "obsm", "varm", "obsp", "varp"):
            if field in fields:
                self._delete_paths(field, getattr(self, field).keys_to_remove)

    def _delete_paths(self, field: str, keys) -> None:
        """Delete ``<field>/<key>`` from the file for every key that is still present."""
        for key in keys:
            path = f"{field}/{key}"
            # keys_to_remove is not reset by overwrite, so a repeated call
            # must not fail on an entry that has already been deleted.
            if path in self._file:
                del self._file[path]

    def create_layer(
        self,
        name: str,
        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
        matrix_shape: Union[tuple[int, int], None] = None,
        data_dtype: Union[np.dtype, None] = None,
        format: Union[str, None] = None,
        compression: str = "lzf",
    ) -> None:
        """
        The empty layer will be created in the case of `matrix` is None.
        """
        self._create_new_matrix_in_field(
            field="layers",
            name=name,
            matrix=matrix,
            matrix_shape=matrix_shape,
            data_dtype=data_dtype,
            format=format,
            compression=compression,
        )
        self._link_layers()

    def create_obsm(
        self,
        name: str,
        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
        matrix_shape: Union[tuple[int, int], None] = None,
        data_dtype: Union[np.dtype, None] = None,
        format: Union[str, None] = None,
        compression: str = "lzf",
    ) -> None:
        """Create the matrix ``obsm/<name>`` in the file and relink ``obsm``."""
        self._create_new_matrix_in_field(
            field="obsm",
            name=name,
            matrix=matrix,
            matrix_shape=matrix_shape,
            data_dtype=data_dtype,
            format=format,
            compression=compression,
        )
        self._link_obsm()

    def create_varm(
        self,
        name: str,
        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
        matrix_shape: Union[tuple[int, int], None] = None,
        data_dtype: Union[np.dtype, None] = None,
        format: Union[str, None] = None,
        compression: str = "lzf",
    ) -> None:
        """Create the matrix ``varm/<name>`` in the file and relink ``varm``."""
        self._create_new_matrix_in_field(
            field="varm",
            name=name,
            matrix=matrix,
            matrix_shape=matrix_shape,
            data_dtype=data_dtype,
            format=format,
            compression=compression,
        )
        self._link_varm()

    def create_obsp(
        self,
        name: str,
        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
        matrix_shape: Union[tuple[int, int], None] = None,
        data_dtype: Union[np.dtype, None] = None,
        format: Union[str, None] = None,
        compression: str = "lzf",
    ) -> None:
        """Create the matrix ``obsp/<name>`` in the file and relink ``obsp``."""
        self._create_new_matrix_in_field(
            field="obsp",
            name=name,
            matrix=matrix,
            matrix_shape=matrix_shape,
            data_dtype=data_dtype,
            format=format,
            compression=compression,
        )
        self._link_obsp()

    def create_varp(
        self,
        name: str,
        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
        matrix_shape: Union[tuple[int, int], None] = None,
        data_dtype: Union[np.dtype, None] = None,
        format: Union[str, None] = None,
        compression: str = "lzf",
    ) -> None:
        """Create the matrix ``varp/<name>`` in the file and relink ``varp``."""
        self._create_new_matrix_in_field(
            field="varp",
            name=name,
            matrix=matrix,
            matrix_shape=matrix_shape,
            data_dtype=data_dtype,
            format=format,
            compression=compression,
        )
        self._link_varp()

    def _create_new_matrix_in_field(self, field, name, **kwargs):
        """Create ``<field>/<name>`` in the file, creating the group if needed.

        **kwargs: matrix, matrix_shape, data_dtype, format, compression

        Raises:
            ValueError: if ``name`` is already present in the field mapping.
        """
        dest = f"{field}/{name}"
        field_entity = getattr(self, field)
        if name in field_entity.keys():
            raise ValueError(
                f"Please explicitly remove the existing '{name}' entity from {field} "
                f"before creating a new one!"
            )
        if field not in self._file.keys():
            self._file.create_group(field)
        self._create_new_matrix(dest=dest, **kwargs)

    # NOTE(review): the remove_* methods relink after deleting, but linking
    # only adds entries found in the file. A mapping that was linked before
    # the deletion appears to keep the stale ``name`` entry -- confirm against
    # CapAnnDataDict's behaviour.

    def remove_layer(self, name: str) -> None:
        """Delete ``layers/<name>`` from the file and relink ``layers``."""
        del self._file[f"layers/{name}"]
        self._link_layers()

    def remove_obsp(self, name: str) -> None:
        """Delete ``obsp/<name>`` from the file and relink ``obsp``."""
        del self._file[f"obsp/{name}"]
        self._link_obsp()

    def remove_varp(self, name: str) -> None:
        """Delete ``varp/<name>`` from the file and relink ``varp``."""
        del self._file[f"varp/{name}"]
        self._link_varp()

    def remove_obsm(self, name: str) -> None:
        """Delete ``obsm/<name>`` from the file and relink ``obsm``."""
        del self._file[f"obsm/{name}"]
        self._link_obsm()

    def remove_varm(self, name: str) -> None:
        """Delete ``varm/<name>`` from the file and relink ``varm``."""
        del self._file[f"varm/{name}"]
        self._link_varm()

    def create_repr(self) -> str:
        """Build the multi-line text summary used by ``repr`` and ``str``.

        Keys of ``obs``, ``var`` and ``uns`` that are held in memory are
        marked with ``*``. For ``uns`` a key counts as in-memory only once its
        placeholder has been replaced by a real value.
        """
        indent = " " * 4
        lines = [
            "CapAnnData object",
            f"{indent}File: {self._file}",
            f"{indent}X shape: {self.shape}",
            f"{indent}Has raw X: {self.raw is not None}",
        ]
        for field in ["obs", "obsm", "var", "uns", "layers"]:
            if field not in self._file:
                continue

            in_memory = set()
            if field in ["obs", "var", "uns"]:
                attr = getattr(self, field)
                if attr is not None:
                    if field == "uns":
                        # Placeholders only mirror the key names of the file;
                        # they are not loaded values.
                        in_memory = {
                            k for k in attr.keys() if attr[k] is not NotLinkedObject
                        }
                    else:
                        in_memory = set(attr.keys())

            keys = [k for k in self._file[field].keys() if k != "_index"]
            keys = [(k if k not in in_memory else f"{k}*") for k in keys]
            # Move the marker outside the quotes: 'name*' -> 'name'*
            keys_str = str(keys).replace("*'", "'*")
            lines.append(f"{indent}{field}: {keys_str}")
        lines.append(f"{indent}Note: fields marked with * are in-memory objects.")
        return "\n".join(lines)

    def __repr__(self) -> str:
        return self.create_repr()

    def __str__(self) -> str:
        return self.create_repr()

    def __enter__(self):
        """Return the object itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, *args):
        """Close the underlying file, if there is one, when the context is left."""
        if self._file is not None:
            self._file.close()
        logger.debug("CapAnnData closed!")
|
1
|
+
import logging
|
2
|
+
import anndata as ad
|
3
|
+
import numpy as np
|
4
|
+
import h5py
|
5
|
+
from typing import List, Union, Any, Tuple, Final
|
6
|
+
import scipy.sparse as ss
|
7
|
+
from packaging import version
|
8
|
+
|
9
|
+
if version.parse(ad.__version__) < version.parse("0.11.0"):
|
10
|
+
from anndata.experimental import sparse_dataset, read_elem, write_elem
|
11
|
+
else:
|
12
|
+
from anndata.io import sparse_dataset, read_elem, write_elem
|
13
|
+
|
14
|
+
from cap_anndata import CapAnnDataDF, CapAnnDataDict
|
15
|
+
|
16
|
+
# Module-level logger; handlers and levels are left to the host application.
logger = logging.getLogger(__name__)

# Type of a matrix that stays on disk: a dense HDF5 dataset, an anndata
# CSR/CSC sparse-dataset wrapper, or None when nothing is linked yet.
# NOTE(review): newer anndata releases may expose CSRDataset/CSCDataset from a
# module other than ``anndata.experimental`` -- confirm against the installed
# version before relying on these attribute paths.
X_NOTATION = Union[
    h5py.Dataset, ad.experimental.CSRDataset, ad.experimental.CSCDataset, None
]
# Mapping from an array name to its backed matrix.
ARRAY_MAPPING_NOTATION = CapAnnDataDict[str, X_NOTATION]

# Placeholder value stored for ``uns`` keys that exist in the file but whose
# content has not been read into memory.
NotLinkedObject: Final = "__NotLinkedObject"
|
24
|
+
|
25
|
+
|
26
|
+
class BaseLayerMatrixAndDf:
    """Shared machinery for objects backed by one group of an AnnData HDF5 file.

    The class gives lazy access to the ``X`` matrix stored under
    ``path_to_content`` and bundles the helpers that subclasses use to read
    DataFrame-like groups, link array mappings and create new matrices.
    """

    def __init__(self, file: h5py.File, path_to_content: str = "/") -> None:
        """Remember the open file and the path prefix of this object's content.

        Args:
            file: open HDF5 file holding the data.
            path_to_content: prefix that is concatenated with keys such as
                ``"X"`` before they are looked up in ``file``, so it is
                expected to end with ``"/"``.
        """
        self._file = file
        self._path_to_content = path_to_content
        self._X: X_NOTATION = None

    @property
    def file(self) -> h5py.File:
        """The underlying HDF5 file object."""
        return self._file

    @property
    def X(self) -> X_NOTATION:
        """The backed ``X`` matrix, linked on first access."""
        if self._X is None:
            self._link_x()
        return self._X

    def _link_x(self) -> None:
        """Bind ``self._X`` to the ``X`` entry found under the content path.

        An ``h5py.Dataset`` is treated as a dense matrix and used as is; any
        other object is wrapped with ``sparse_dataset``.
        """
        x = self._file[self._path_to_content + "X"]
        self._X = x if isinstance(x, h5py.Dataset) else sparse_dataset(x)

    @property
    def shape(self) -> Union[Tuple[int, int], None]:
        """Shape of ``X`` as a tuple of plain ``int`` values, or ``None`` if ``X`` is ``None``."""
        x = self.X
        if x is None:
            return None
        return tuple(map(int, x.shape))

    @staticmethod
    def _ensure_object_column_order(df: CapAnnDataDF) -> CapAnnDataDF:
        """Cast ``df.column_order`` to ``object`` dtype when it has another dtype.

        Returns the same ``df`` so the call can be used in a return statement.
        """
        if df.column_order.dtype != object:
            # empty DataFrame will have column_order as float64
            # which leads to failure in overwrite method
            df.column_order = df.column_order.astype(object)
        return df

    def _lazy_df_load(self, key: str) -> CapAnnDataDF:
        """Create a ``CapAnnDataDF`` that carries only the stored column order.

        No column data is read; only the ``column-order`` attribute of the
        group ``path_to_content + key`` is consulted.
        """
        df = CapAnnDataDF()
        attribute = self._path_to_content + key
        df.column_order = self._read_attr(self._file[attribute], "column-order")
        return self._ensure_object_column_order(df)

    @staticmethod
    def _read_attr(obj: Union[h5py.Group, h5py.Dataset], attr_name: str) -> Any:
        """Return the attribute ``attr_name`` of ``obj``.

        Only the requested attribute is read; the attribute collection is not
        copied.

        Raises:
            KeyError: if ``obj`` has no attribute called ``attr_name``.
        """
        attrs = obj.attrs
        if attr_name not in attrs:
            raise KeyError(f"The {attr_name} doesn't exist!")
        return attrs[attr_name]

    def _read_df(self, key: str, columns: Union[List[str], str, None]) -> CapAnnDataDF:
        """Read the DataFrame-like group ``key`` from the file.

        Args:
            key: group name relative to the content path (for example ``"obs"``).
            columns: columns to read. ``None`` reads the whole group, a single
                string is treated as a one-element list, and names that are
                not present in the stored column order are skipped.

        Raises:
            ValueError: if the group does not exist in the file.
            KeyError: if a required attribute is missing on the group.
        """
        group_path = self._path_to_content + key
        if group_path not in self._file.keys():
            raise ValueError(f"The group {group_path} doesn't exist in the file!")

        h5_group = self._file[group_path]
        column_order = self._read_attr(h5_group, "column-order")

        if columns is None:
            # read whole df
            df = CapAnnDataDF.from_df(read_elem(h5_group), column_order=column_order)
        else:
            if isinstance(columns, str):
                # single column provided instead of list
                columns = [columns]
            cols_to_read = [c for c in columns if c in column_order]
            df = CapAnnDataDF()
            df.column_order = column_order
            # The "_index" attribute names the dataset that stores the index.
            index_col = self._read_attr(h5_group, "_index")
            df.index = read_elem(h5_group[index_col])

            for col in cols_to_read:
                df[col] = read_elem(h5_group[col])

        return self._ensure_object_column_order(df)

    def _write_elem(self, dest_key: str, elem: Any, compression: str) -> None:
        """Write ``elem`` to ``dest_key`` with ``write_elem`` using the given compression."""
        write_elem(
            self._file, dest_key, elem, dataset_kwargs={"compression": compression}
        )

    def _validate_cap_df(self, cap_df: CapAnnDataDF, axis: int) -> None:
        """Check that ``cap_df`` can be attached along ``axis`` of ``X``.

        Raises:
            TypeError: if ``cap_df`` is not a ``CapAnnDataDF``.
            ValueError: if ``axis`` is not 0 or 1, or if the number of rows of
                ``cap_df`` differs from ``self.shape[axis]``.
        """
        if not isinstance(cap_df, CapAnnDataDF):
            raise TypeError(
                f"The input should be an instance of CapAnnDataDF class but {type(cap_df)} given!"
            )

        if axis not in [0, 1]:
            raise ValueError("The axis should be either 0 or 1!")

        if cap_df.shape[0] != self.shape[axis]:
            items = "cells" if axis == 0 else "genes"
            raise ValueError(
                f"The number of rows in the input DataFrame should be equal to the number of {items} in the "
                "AnnData object!"
            )

    def _link_array_mapping(self, cap_dict: CapAnnDataDict, key: str) -> None:
        """Method to update given cap_dict with backed array entities from the file.

        Datasets inside the group ``key`` are stored as is, sub-groups are
        wrapped with ``sparse_dataset``.

        Raises:
            KeyError: if ``key`` is not present in the file.
            ValueError: if ``key`` is not a group or holds an unsupported object.
        """
        if key not in self._file.keys():
            raise KeyError(f"The key {key} doesn't exist in the file! Ignore linking.")

        group = self._file[key]
        if not isinstance(group, h5py.Group):
            raise ValueError(f"The object {key} must be a group!")

        for array_name in group.keys():
            array = group[array_name]
            if isinstance(array, h5py.Dataset):
                cap_dict[array_name] = array
            elif isinstance(array, h5py.Group):
                cap_dict[array_name] = sparse_dataset(array)
            else:
                raise ValueError(
                    f"Can't link array in {key} due to unsupported type of object: {type(array)}"
                )

    def _create_new_matrix(
        self,
        dest: str,
        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
        matrix_shape: Union[tuple[int, int], None] = None,
        data_dtype: Union[np.dtype, None] = None,
        format: Union[str, None] = None,  # TODO: use Enum instead of str
        compression: str = "lzf",
    ) -> None:
        """Create a matrix at ``dest`` in the file.

        Args:
            dest: destination path inside the file.
            matrix: data to write. When given, it is written as is and the
                remaining shape/dtype/format arguments are not used.
            matrix_shape: shape of the empty matrix to create when ``matrix``
                is ``None``; the sparse formats fall back to ``(0, 0)``.
            data_dtype: dtype of the empty matrix; the sparse formats fall
                back to ``np.float64``.
            format: ``"dense"``, ``"csr"`` or ``"csc"``; only consulted when
                ``matrix`` is ``None``.
            compression: compression passed on to the writer.

        Raises:
            NotImplementedError: if ``matrix`` is ``None`` and ``format`` is
                not one of the supported values.
        """
        if matrix is not None:
            self._write_elem(dest, matrix, compression=compression)
            return

        if format == "dense":
            group = self._file.create_dataset(
                name=dest,
                shape=matrix_shape,
                dtype=data_dtype,
                compression=compression,
            )
            # https://anndata.readthedocs.io/en/latest/fileformat-prose.html#dense-arrays-specification-v0-2-0
            group.attrs["encoding-type"] = "array"
            group.attrs["encoding-version"] = "0.2.0"
        elif format in ["csr", "csc"]:
            # Based on https://github.com/appier/h5sparse/blob/master/h5sparse/h5sparse.py
            if data_dtype is None:
                data_dtype = np.float64
            if matrix_shape is None:
                matrix_shape = (0, 0)
            sparse_class = ss.csr_matrix if format == "csr" else ss.csc_matrix
            data = sparse_class(matrix_shape, dtype=data_dtype)
            self._write_elem(dest, data, compression=compression)
        else:
            raise NotImplementedError(
                f"Format must be 'dense', 'csr' or 'csc' but {format} given!"
            )
|
186
|
+
|
187
|
+
|
188
|
+
class RawLayer(BaseLayerMatrixAndDf):
    """Backed view of the ``/raw/`` group: its ``X`` matrix and its ``var`` table."""

    def __init__(self, h5_file: h5py.File):
        """Bind the layer to ``h5_file`` with ``"/raw/"`` as the content path."""
        super().__init__(h5_file, path_to_content="/raw/")
        # Filled on first access of ``var`` or by ``read_var``.
        self._var: Union[CapAnnDataDF, None] = None

    @property
    def var(self) -> CapAnnDataDF:
        """The ``var`` table; on first access only the column order is loaded."""
        if self._var is None:
            self._var = self._lazy_df_load("var")
        return self._var

    @var.setter
    def var(self, cap_df: CapAnnDataDF) -> None:
        """Replace ``var`` after validating ``cap_df`` against axis 1 of ``X``."""
        self._validate_cap_df(cap_df, axis=1)
        self._var = cap_df

    def read_var(
        self, columns: Union[List[str], str, None] = None, reset: bool = False
    ) -> None:
        """Read ``var`` columns from the file into memory.

        Args:
            columns: columns to read; ``None`` reads the whole table.
            reset: when true, or when the current ``var`` is empty, the
                in-memory table is replaced by what was read. Otherwise the
                read columns are assigned into the existing table one by one.
        """
        df = self._read_df(key="var", columns=columns)
        if self.var.empty or reset:
            self._var = df
            return
        for col in df.columns:
            self._var[col] = df[col]
|
211
|
+
|
212
|
+
|
213
|
+
class CapAnnData(BaseLayerMatrixAndDf):
|
214
|
+
def __init__(self, h5_file: h5py.File) -> None:
|
215
|
+
super().__init__(h5_file, path_to_content="/")
|
216
|
+
self._file: h5py.File = h5_file
|
217
|
+
self._obs: CapAnnDataDF = None
|
218
|
+
self._var: CapAnnDataDF = None
|
219
|
+
self._X: X_NOTATION = None
|
220
|
+
self._obsm: CapAnnDataDict = None
|
221
|
+
self._varm: CapAnnDataDict = None
|
222
|
+
self._layers: CapAnnDataDict = None
|
223
|
+
self._uns: CapAnnDataDict = None
|
224
|
+
self._obsp: CapAnnDataDict = None
|
225
|
+
self._varp: CapAnnDataDict = None
|
226
|
+
self._raw: RawLayer = None
|
227
|
+
self._shape: Tuple[int, int] = None
|
228
|
+
|
229
|
+
@property
|
230
|
+
def obs(self) -> CapAnnDataDF:
|
231
|
+
if self._obs is None:
|
232
|
+
self._obs = self._lazy_df_load("obs")
|
233
|
+
return self._obs
|
234
|
+
|
235
|
+
@obs.setter
|
236
|
+
def obs(self, cap_df: CapAnnDataDF) -> None:
|
237
|
+
self._validate_cap_df(cap_df, axis=0)
|
238
|
+
self._obs = cap_df
|
239
|
+
|
240
|
+
@property
|
241
|
+
def var(self) -> CapAnnDataDF:
|
242
|
+
if self._var is None:
|
243
|
+
self._var = self._lazy_df_load("var")
|
244
|
+
return self._var
|
245
|
+
|
246
|
+
@var.setter
|
247
|
+
def var(self, cap_df: CapAnnDataDF) -> None:
|
248
|
+
self._validate_cap_df(cap_df, axis=1)
|
249
|
+
self._var = cap_df
|
250
|
+
|
251
|
+
@property
def raw(self) -> RawLayer:
    """Return the cached ``RawLayer`` wrapper, creating it on first access.

    Returns ``None`` (after logging a warning) when the file has no ``raw``
    group or when that group has no members; nothing is cached in that case,
    so the check and the warning repeat on the next access.
    """
    if self._raw is not None:
        return self._raw

    if "raw" not in self._file.keys():
        logger.warning("Can't read raw.var since raw layer doesn't exist!")
        return None

    # Explicit length check: the group exists but holds nothing.
    if len(self._file["raw"].keys()) == 0:
        logger.warning("The raw layer is empty!")
        return None

    self._raw = RawLayer(self._file)
    return self._raw
|
264
|
+
|
265
|
+
@property
def uns(self) -> CapAnnDataDict[str, Any]:
    """Return the ``uns`` mapping, creating it on first access.

    Every key found in the file's ``uns`` group is registered with the
    ``NotLinkedObject`` placeholder; values are only loaded by ``read_uns``.
    A file without an ``uns`` group yields an empty mapping instead of a
    ``KeyError``, matching how the ``_link_*`` helpers treat missing groups.
    """
    if self._uns is None:
        stored_keys = self._file["uns"].keys() if "uns" in self._file.keys() else ()
        self._uns = CapAnnDataDict({k: NotLinkedObject for k in stored_keys})
    return self._uns
|
272
|
+
|
273
|
+
@property
def layers(self) -> CapAnnDataDict[str, X_NOTATION]:
    """Return the ``layers`` mapping, linking it via ``_link_layers`` on first access."""
    if self._layers is not None:
        return self._layers
    self._link_layers()
    return self._layers
|
278
|
+
|
279
|
+
@property
def obsm(self) -> CapAnnDataDict[str, X_NOTATION]:
    """Return the ``obsm`` mapping, linking it via ``_link_obsm`` on first access."""
    if self._obsm is not None:
        return self._obsm
    self._link_obsm()
    return self._obsm
|
284
|
+
|
285
|
+
@property
def varm(self) -> CapAnnDataDict[str, X_NOTATION]:
    """Return the ``varm`` mapping, linking it via ``_link_varm`` on first access."""
    if self._varm is not None:
        return self._varm
    self._link_varm()
    return self._varm
|
290
|
+
|
291
|
+
@property
def obsp(self) -> CapAnnDataDict[str, X_NOTATION]:
    """Return the ``obsp`` mapping, linking it via ``_link_obsp`` on first access."""
    if self._obsp is not None:
        return self._obsp
    self._link_obsp()
    return self._obsp
|
296
|
+
|
297
|
+
@property
def varp(self) -> CapAnnDataDict[str, X_NOTATION]:
    """Return the ``varp`` mapping, linking it via ``_link_varp`` on first access."""
    if self._varp is not None:
        return self._varp
    self._link_varp()
    return self._varp
|
302
|
+
|
303
|
+
def read_obs(self, columns: List[str] = None, reset: bool = False) -> None:
    """Read ``obs`` columns through ``_read_df`` and put them into the cached frame.

    Args:
        columns: Column names forwarded to ``_read_df``.
        reset: When true, the cached frame is replaced by the freshly read one.
    """
    fresh = self._read_df("obs", columns=columns)
    if self.obs.empty or reset:
        # Nothing worth keeping (or an explicit reset): swap the whole frame.
        self._obs = fresh
        return
    # Otherwise add/overwrite column by column, keeping the other cached columns.
    for name in fresh.columns:
        self._obs[name] = fresh[name]
|
310
|
+
|
311
|
+
def read_var(self, columns: List[str] = None, reset: bool = False) -> None:
    """Read ``var`` columns through ``_read_df`` and put them into the cached frame.

    Args:
        columns: Column names forwarded to ``_read_df``.
        reset: When true, the cached frame is replaced by the freshly read one.
    """
    fresh = self._read_df("var", columns=columns)
    if self.var.empty or reset:
        # Nothing worth keeping (or an explicit reset): swap the whole frame.
        self._var = fresh
        return
    # Otherwise add/overwrite column by column, keeping the other cached columns.
    for name in fresh.columns:
        self._var[name] = fresh[name]
|
318
|
+
|
319
|
+
def read_uns(self, keys: List[str] = None) -> None:
    """Load ``uns`` entries from the file into the in-memory ``uns`` mapping.

    Args:
        keys: Entries to load; ``None`` means every key currently in ``uns``.
            Names that are not keys of ``uns`` are skipped silently.
    """
    requested = list(self.uns.keys()) if keys is None else keys
    for name in requested:
        if name not in self.uns.keys():
            continue
        # Replace the stored value with the element read from the file.
        self.uns[name] = read_elem(self._file[f"uns/{name}"])
|
328
|
+
|
329
|
+
def _link_layers(self) -> None:
    """Ensure the ``layers`` mapping exists and link it to the file's ``layers`` group, if any."""
    group = "layers"
    if self._layers is None:
        self._layers = CapAnnDataDict()
    if group not in self._file.keys():
        # No such group in the file: the mapping is left as it is.
        return
    self._link_array_mapping(cap_dict=self._layers, key=group)
|
334
|
+
|
335
|
+
def _link_obsm(self) -> None:
    """Ensure the ``obsm`` mapping exists and link it to the file's ``obsm`` group, if any."""
    group = "obsm"
    if self._obsm is None:
        self._obsm = CapAnnDataDict()
    if group not in self._file.keys():
        # No such group in the file: the mapping is left as it is.
        return
    self._link_array_mapping(cap_dict=self._obsm, key=group)
|
341
|
+
|
342
|
+
def _link_varm(self) -> None:
    """Ensure the ``varm`` mapping exists and link it to the file's ``varm`` group, if any."""
    group = "varm"
    if self._varm is None:
        self._varm = CapAnnDataDict()
    if group not in self._file.keys():
        # No such group in the file: the mapping is left as it is.
        return
    self._link_array_mapping(cap_dict=self._varm, key=group)
|
348
|
+
|
349
|
+
def _link_obsp(self):
    """Ensure the ``obsp`` mapping exists and link it to the file's ``obsp`` group, if any."""
    group = "obsp"
    if self._obsp is None:
        self._obsp = CapAnnDataDict()
    if group not in self._file.keys():
        # No such group in the file: the mapping is left as it is.
        return
    self._link_array_mapping(cap_dict=self._obsp, key=group)
|
356
|
+
|
357
|
+
def _link_varp(self):
    """Ensure the ``varp`` mapping exists and link it to the file's ``varp`` group, if any."""
    group = "varp"
    if self._varp is None:
        self._varp = CapAnnDataDict()
    if group not in self._file.keys():
        # No such group in the file: the mapping is left as it is.
        return
    self._link_array_mapping(cap_dict=self._varp, key=group)
|
364
|
+
|
365
|
+
def obsm_keys(self) -> List[str]:
    """Return the keys of the ``obsm`` mapping as a new list."""
    return [*self.obsm.keys()]
|
367
|
+
|
368
|
+
def obs_keys(self) -> List[str]:
    """Return the ``column_order`` of the ``obs`` frame as a plain list."""
    order = self.obs.column_order
    return order.tolist()
|
370
|
+
|
371
|
+
def var_keys(self) -> List[str]:
    """Return the ``column_order`` of the ``var`` frame as a plain list."""
    order = self.var.column_order
    return order.tolist()
|
373
|
+
|
374
|
+
def overwrite(self, fields: List[str] = None, compression: str = "lzf") -> None:
    """Write in-memory changes of the selected fields back into the file.

    For ``obs``, ``var`` and ``raw.var`` every column of the frame is written
    and the group's ``column-order`` attribute is updated. For ``uns`` every
    loaded value is written and keys queued in ``keys_to_remove`` are deleted.
    For ``layers``, ``obsm``, ``varm``, ``obsp`` and ``varp`` only the keys
    queued in ``keys_to_remove`` are deleted from the file.

    Args:
        fields: Names of the fields to persist; ``None`` selects all of them.
        compression: Compression argument forwarded to ``_write_elem``.

    Raises:
        KeyError: If ``fields`` contains a name that is not supported.
    """
    # Read the raw layer once: while it is missing, each access to the
    # ``raw`` property logs a warning.
    raw_layer = self.raw
    field_to_entity = {
        "obs": self.obs,
        "var": self.var,
        "raw.var": raw_layer.var if raw_layer is not None else None,
        "uns": self.uns,
        "layers": self.layers,
        "obsm": self.obsm,
        "varm": self.varm,
        "obsp": self.obsp,
        "varp": self.varp,
    }

    if fields is None:
        fields = list(field_to_entity.keys())
    else:
        # Build the list from the mapping so the message cannot drift from
        # what is actually accepted.
        supported = ", ".join(field_to_entity)
        for f in fields:
            if f not in field_to_entity:
                raise KeyError(
                    f"The field {f} is not supported! The list of supported fields are equal to supported "
                    f"attributes of the CapAnnData class: {supported}."
                )

    for field in ["obs", "var", "raw.var"]:
        if field not in fields:
            continue
        entity: CapAnnDataDF = field_to_entity[field]
        if entity is None:
            continue

        # "raw.var" lives under the "raw/var" path in the file.
        group = field.replace(".", "/")

        for col in entity.columns:
            self._write_elem(
                f"{group}/{col}", entity[col].values, compression=compression
            )

        column_order = entity.column_order
        if (
            column_order.size == 0
        ):  # Refs https://github.com/cellannotation/cap-anndata/issues/6
            column_order = np.array([], dtype=np.float64)
        self._file[group].attrs["column-order"] = column_order

    if "uns" in fields:
        for key in self.uns.keys():
            # Placeholders were never loaded, so there is nothing to write.
            if self.uns[key] is not NotLinkedObject:
                dest = f"uns/{key}"
                self._write_elem(dest, self.uns[key], compression=compression)
        for key in self.uns.keys_to_remove:
            del self._file[f"uns/{key}"]

    for field in ["layers", "obsm", "varm", "obsp", "varp"]:
        if field in fields:
            for key in field_to_entity[field].keys_to_remove:
                del self._file[f"{field}/{key}"]
|
429
|
+
|
430
|
+
def create_layer(
    self,
    name: str,
    matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
    matrix_shape: Union[tuple[int, int], None] = None,
    data_dtype: Union[np.dtype, None] = None,
    format: Union[str, None] = None,
    compression: str = "lzf",
) -> None:
    """
    Create a new matrix ``name`` in ``layers`` and re-link the ``layers`` mapping.

    The empty layer will be created in the case of `matrix` is None.
    All matrix arguments are forwarded to ``_create_new_matrix_in_field``.
    """
    matrix_spec = dict(
        matrix=matrix,
        matrix_shape=matrix_shape,
        data_dtype=data_dtype,
        format=format,
        compression=compression,
    )
    self._create_new_matrix_in_field(field="layers", name=name, **matrix_spec)
    self._link_layers()
|
452
|
+
|
453
|
+
def create_obsm(
    self,
    name: str,
    matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
    matrix_shape: Union[tuple[int, int], None] = None,
    data_dtype: Union[np.dtype, None] = None,
    format: Union[str, None] = None,
    compression: str = "lzf",
) -> None:
    """
    Create a new matrix ``name`` in ``obsm`` and re-link the ``obsm`` mapping.

    All matrix arguments are forwarded to ``_create_new_matrix_in_field``.
    """
    matrix_spec = dict(
        matrix=matrix,
        matrix_shape=matrix_shape,
        data_dtype=data_dtype,
        format=format,
        compression=compression,
    )
    self._create_new_matrix_in_field(field="obsm", name=name, **matrix_spec)
    self._link_obsm()
|
472
|
+
|
473
|
+
def create_varm(
    self,
    name: str,
    matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
    matrix_shape: Union[tuple[int, int], None] = None,
    data_dtype: Union[np.dtype, None] = None,
    format: Union[str, None] = None,
    compression: str = "lzf",
) -> None:
    """
    Create a new matrix ``name`` in ``varm`` and re-link the ``varm`` mapping.

    All matrix arguments are forwarded to ``_create_new_matrix_in_field``.
    """
    matrix_spec = dict(
        matrix=matrix,
        matrix_shape=matrix_shape,
        data_dtype=data_dtype,
        format=format,
        compression=compression,
    )
    self._create_new_matrix_in_field(field="varm", name=name, **matrix_spec)
    self._link_varm()
|
492
|
+
|
493
|
+
def create_obsp(
    self,
    name: str,
    matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
    matrix_shape: Union[tuple[int, int], None] = None,
    data_dtype: Union[np.dtype, None] = None,
    format: Union[str, None] = None,
    compression: str = "lzf",
) -> None:
    """
    Create a new matrix ``name`` in ``obsp`` and re-link the ``obsp`` mapping.

    All matrix arguments are forwarded to ``_create_new_matrix_in_field``.
    """
    matrix_spec = dict(
        matrix=matrix,
        matrix_shape=matrix_shape,
        data_dtype=data_dtype,
        format=format,
        compression=compression,
    )
    self._create_new_matrix_in_field(field="obsp", name=name, **matrix_spec)
    self._link_obsp()
|
512
|
+
|
513
|
+
def create_varp(
    self,
    name: str,
    matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
    matrix_shape: Union[tuple[int, int], None] = None,
    data_dtype: Union[np.dtype, None] = None,
    format: Union[str, None] = None,
    compression: str = "lzf",
) -> None:
    """
    Create a new matrix ``name`` in ``varp`` and re-link the ``varp`` mapping.

    All matrix arguments are forwarded to ``_create_new_matrix_in_field``.
    """
    matrix_spec = dict(
        matrix=matrix,
        matrix_shape=matrix_shape,
        data_dtype=data_dtype,
        format=format,
        compression=compression,
    )
    self._create_new_matrix_in_field(field="varp", name=name, **matrix_spec)
    self._link_varp()
|
533
|
+
|
534
|
+
def _create_new_matrix_in_field(self, field, name, **kwargs):
    """Create matrix ``name`` under the ``field`` group of the file.

    **kwargs: matrix, matrix_shape, data_dtype, format, compression

    Raises:
        ValueError: If ``name`` is already a key of the ``field`` mapping.
    """
    existing = getattr(self, field)
    if name in existing.keys():
        raise ValueError(
            f"Please explicitly remove the existing '{name}' entity from {field} "
            f"before creating a new one!"
        )
    # The group is created on demand for files that do not have it yet.
    if field not in self._file.keys():
        self._file.create_group(field)
    self._create_new_matrix(dest=f"{field}/{name}", **kwargs)
|
546
|
+
|
547
|
+
def remove_layer(self, name: str) -> None:
    """Delete ``layers/<name>`` from the file, then re-link the ``layers`` mapping."""
    target = f"layers/{name}"
    del self._file[target]
    self._link_layers()
|
550
|
+
|
551
|
+
def remove_obsp(self, name: str) -> None:
    """Delete ``obsp/<name>`` from the file, then re-link the ``obsp`` mapping."""
    target = f"obsp/{name}"
    del self._file[target]
    self._link_obsp()
|
554
|
+
|
555
|
+
def remove_varp(self, name: str) -> None:
    """Delete ``varp/<name>`` from the file, then re-link the ``varp`` mapping."""
    target = f"varp/{name}"
    del self._file[target]
    self._link_varp()
|
558
|
+
|
559
|
+
def remove_obsm(self, name: str) -> None:
    """Delete ``obsm/<name>`` from the file, then re-link the ``obsm`` mapping."""
    target = f"obsm/{name}"
    del self._file[target]
    self._link_obsm()
|
562
|
+
|
563
|
+
def remove_varm(self, name: str) -> None:
    """Delete ``varm/<name>`` from the file, then re-link the ``varm`` mapping."""
    target = f"varm/{name}"
    del self._file[target]
    self._link_varm()
|
566
|
+
|
567
|
+
def create_repr(self) -> str:
    """Build the multi-line text summary used by ``__repr__`` and ``__str__``.

    The summary lists the file, the X shape, whether a raw layer is available
    and, for each of obs/obsm/var/uns/layers present in the file, the keys
    stored there (``_index`` excluded). Keys that are loaded in memory are
    marked with ``*``. ``uns`` entries that still hold the ``NotLinkedObject``
    placeholder are not marked, because their values were never read.
    """
    indent = " " * 4
    lines = [
        "CapAnnData object",
        f"{indent}File: {self._file}",
        f"{indent}X shape: {self.shape}",
        f"{indent}Has raw X: {self.raw is not None}",
    ]
    for field in ["obs", "obsm", "var", "uns", "layers"]:
        if field not in self._file:
            continue
        in_memory = set()
        if field in ["obs", "var", "uns"]:
            attr = getattr(self, field)
            if attr is not None:
                if field == "uns":
                    # The uns mapping registers every file key up front with a
                    # placeholder; only replaced values are really in memory.
                    in_memory = {
                        k for k in attr.keys() if attr[k] is not NotLinkedObject
                    }
                else:
                    in_memory = set(attr.keys())
        keys = [k for k in self._file[field].keys() if k != "_index"]
        keys = [(f"{k}*" if k in in_memory else k) for k in keys]
        # Move the marker outside the closing quote: 'name*' -> 'name'*
        keys_str = str(keys).replace("*'", "'*")
        lines.append(f"{indent}{field}: {keys_str}")
    lines.append(f"{indent}Note: fields marked with * are in-memory objects.")
    return "\n".join(lines)
|
587
|
+
|
588
|
+
def __repr__(self) -> str:
    """Return the summary text produced by ``create_repr``."""
    summary = self.create_repr()
    return summary
|
590
|
+
|
591
|
+
def __str__(self) -> str:
    """Return the summary text produced by ``create_repr``."""
    summary = self.create_repr()
    return summary
|
593
|
+
|
594
|
+
def __enter__(self):
    """Enter the ``with`` block; the object itself is the managed value."""
    return self
|
596
|
+
|
597
|
+
def __exit__(self, *args):
    """Leave the ``with`` block: close the file if there is one and log it."""
    handle = self._file
    if handle is not None:
        handle.close()
    # NOTE(review): indentation was lost in the captured source; the debug
    # message is assumed to be logged unconditionally - confirm.
    logger.debug("CapAnnData closed!")
|