cap-anndata 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,600 +1,600 @@
1
- import logging
2
- import anndata as ad
3
- import numpy as np
4
- import h5py
5
- from typing import List, Union, Any, Tuple, Final
6
- import scipy.sparse as ss
7
- from packaging import version
8
-
9
- if version.parse(ad.__version__) < version.parse("0.11.0"):
10
- from anndata.experimental import sparse_dataset, read_elem, write_elem
11
- else:
12
- from anndata import sparse_dataset, read_elem, write_elem
13
-
14
- from cap_anndata import CapAnnDataDF, CapAnnDataDict
15
-
16
- logger = logging.getLogger(__name__)
17
-
18
- X_NOTATION = Union[
19
- h5py.Dataset, ad.experimental.CSRDataset, ad.experimental.CSCDataset, None
20
- ]
21
- ARRAY_MAPPING_NOTATION = CapAnnDataDict[str, X_NOTATION]
22
-
23
- NotLinkedObject: Final = "__NotLinkedObject"
24
-
25
-
26
- class BaseLayerMatrixAndDf:
27
- def __init__(self, file: h5py.File, path_to_content: str = "/") -> None:
28
- self._file = file
29
- self._path_to_content = path_to_content
30
- self._X: X_NOTATION = None
31
-
32
- @property
33
- def file(self) -> h5py.File:
34
- return self._file
35
-
36
- @property
37
- def X(self) -> X_NOTATION:
38
- if self._X is None:
39
- self._link_x()
40
- return self._X
41
-
42
- def _link_x(self) -> None:
43
- x = self._file[self._path_to_content + "X"]
44
- if isinstance(x, h5py.Dataset):
45
- # dense X
46
- self._X = x
47
- else:
48
- # sparse dataset
49
- self._X = sparse_dataset(x)
50
-
51
- @property
52
- def shape(self) -> Tuple[int, int]:
53
- if self.X is not None:
54
- shape = tuple(map(int, self.X.shape))
55
- else:
56
- shape = None
57
- return shape
58
-
59
- def _lazy_df_load(self, key: str) -> CapAnnDataDF:
60
- df = CapAnnDataDF()
61
- attribute = self._path_to_content + key
62
- column_order = self._read_attr(self._file[attribute], "column-order")
63
- df.column_order = column_order
64
- if df.column_order.dtype != object:
65
- # empty DataFrame will have column_order as float64
66
- # which leads to failure in overwrite method
67
- df.column_order = df.column_order.astype(object)
68
- return df
69
-
70
- @staticmethod
71
- def _read_attr(obj: Union[h5py.Group, h5py.Dataset], attr_name: str) -> any:
72
- attrs = dict(obj.attrs)
73
- if attr_name not in attrs.keys():
74
- raise KeyError(f"The {attr_name} doesn't exist!")
75
- return attrs[attr_name]
76
-
77
- def _read_df(self, key: str, columns: List[str]) -> CapAnnDataDF:
78
- group_path = self._path_to_content + key
79
- if group_path not in self._file.keys():
80
- raise ValueError(f"The group {group_path} doesn't exist in the file!")
81
-
82
- h5_group = self._file[group_path]
83
-
84
- column_order = self._read_attr(h5_group, "column-order")
85
-
86
- if columns is None:
87
- # read whole df
88
- df = CapAnnDataDF.from_df(read_elem(h5_group), column_order=column_order)
89
- else:
90
- if isinstance(columns, str):
91
- # single column provided instead of list
92
- columns = [columns]
93
- cols_to_read = [c for c in columns if c in column_order]
94
- df = CapAnnDataDF()
95
- df.column_order = column_order
96
- index_col = self._read_attr(h5_group, "_index")
97
- df.index = read_elem(h5_group[index_col])
98
-
99
- for col in cols_to_read:
100
- df[col] = read_elem(h5_group[col])
101
-
102
- if df.column_order.dtype != object:
103
- # empty DataFrame will have column_order as float64
104
- # which leads to failure in overwrite method
105
- df.column_order = df.column_order.astype(object)
106
- return df
107
-
108
- def _write_elem(self, dest_key: str, elem: any, compression: str) -> None:
109
- write_elem(
110
- self._file, dest_key, elem, dataset_kwargs={"compression": compression}
111
- )
112
-
113
- def _validate_cap_df(self, cap_df: CapAnnDataDF, axis: int) -> None:
114
- if not isinstance(cap_df, CapAnnDataDF):
115
- raise TypeError(
116
- f"The input should be an instance of CapAnnDataDF class but {type(cap_df)} given!"
117
- )
118
-
119
- if axis not in [0, 1]:
120
- raise ValueError("The axis should be either 0 or 1!")
121
-
122
- if cap_df.shape[0] != self.shape[axis]:
123
- items = "cells" if axis == 0 else "genes"
124
- raise ValueError(
125
- f"The number of rows in the input DataFrame should be equal to the number of {items} in the "
126
- "AnnData object!"
127
- )
128
-
129
- def _link_array_mapping(self, cap_dict: CapAnnDataDict, key: str) -> None:
130
- """Method to update given cap_dict with backed array entities from the file."""
131
- if key not in self._file.keys():
132
- raise KeyError(f"The key {key} doesn't exist in the file! Ignore linking.")
133
-
134
- group = self._file[key]
135
- if not isinstance(group, h5py.Group):
136
- raise ValueError(f"The object {key} must be a group!")
137
-
138
- for array_name in group.keys():
139
- array = group[array_name]
140
- if isinstance(array, h5py.Dataset):
141
- cap_dict[array_name] = array
142
- elif isinstance(array, h5py.Group):
143
- cap_dict[array_name] = sparse_dataset(array)
144
- else:
145
- raise ValueError(
146
- f"Can't link array in {key} due to unsupported type of object: {type(array)}"
147
- )
148
-
149
- def _create_new_matrix(
150
- self,
151
- dest: str,
152
- matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
153
- matrix_shape: Union[tuple[int, int], None] = None,
154
- data_dtype: Union[np.dtype, None] = None,
155
- format: Union[str, None] = None, # TODO: use Enum instead of str
156
- compression: str = "lzf",
157
- ) -> None:
158
- if matrix is not None:
159
- self._write_elem(dest, matrix, compression=compression)
160
- else:
161
- if format == "dense":
162
- group = self._file.create_dataset(
163
- name=dest,
164
- shape=matrix_shape,
165
- dtype=data_dtype,
166
- compression=compression,
167
- )
168
- # https://anndata.readthedocs.io/en/latest/fileformat-prose.html#dense-arrays-specification-v0-2-0
169
- group.attrs["encoding-type"] = "array"
170
- group.attrs["encoding-version"] = "0.2.0"
171
- elif format in [
172
- "csr",
173
- "csc",
174
- ]: # Based on https://github.com/appier/h5sparse/blob/master/h5sparse/h5sparse.py
175
- if data_dtype is None:
176
- data_dtype = np.float64
177
- if matrix_shape is None:
178
- matrix_shape = (0, 0)
179
- sparse_class = ss.csr_matrix if format == "csr" else ss.csc_matrix
180
- data = sparse_class(matrix_shape, dtype=data_dtype)
181
- self._write_elem(dest, data, compression=compression)
182
- else:
183
- raise NotImplementedError(
184
- f"Format must be 'dense', 'csr' or 'csc' but {format} given!"
185
- )
186
-
187
-
188
- class RawLayer(BaseLayerMatrixAndDf):
189
- def __init__(self, h5_file: h5py.File):
190
- super().__init__(h5_file, path_to_content="/raw/")
191
- self._var: CapAnnDataDF = None
192
-
193
- @property
194
- def var(self) -> CapAnnDataDF:
195
- if self._var is None:
196
- self._var = self._lazy_df_load("var")
197
- return self._var
198
-
199
- @var.setter
200
- def var(self, cap_df: CapAnnDataDF) -> None:
201
- self._validate_cap_df(cap_df, axis=1)
202
- self._var = cap_df
203
-
204
- def read_var(self, columns: List[str] = None, reset: bool = False) -> None:
205
- df = self._read_df(key="var", columns=columns)
206
- if self.var.empty or reset:
207
- self._var = df
208
- else:
209
- for col in df.columns:
210
- self._var[col] = df[col]
211
-
212
-
213
- class CapAnnData(BaseLayerMatrixAndDf):
214
- def __init__(self, h5_file: h5py.File) -> None:
215
- super().__init__(h5_file, path_to_content="/")
216
- self._file: h5py.File = h5_file
217
- self._obs: CapAnnDataDF = None
218
- self._var: CapAnnDataDF = None
219
- self._X: X_NOTATION = None
220
- self._obsm: CapAnnDataDict = None
221
- self._varm: CapAnnDataDict = None
222
- self._layers: CapAnnDataDict = None
223
- self._uns: CapAnnDataDict = None
224
- self._obsp: CapAnnDataDict = None
225
- self._varp: CapAnnDataDict = None
226
- self._raw: RawLayer = None
227
- self._shape: Tuple[int, int] = None
228
-
229
- @property
230
- def obs(self) -> CapAnnDataDF:
231
- if self._obs is None:
232
- self._obs = self._lazy_df_load("obs")
233
- return self._obs
234
-
235
- @obs.setter
236
- def obs(self, cap_df: CapAnnDataDF) -> None:
237
- self._validate_cap_df(cap_df, axis=0)
238
- self._obs = cap_df
239
-
240
- @property
241
- def var(self) -> CapAnnDataDF:
242
- if self._var is None:
243
- self._var = self._lazy_df_load("var")
244
- return self._var
245
-
246
- @var.setter
247
- def var(self, cap_df: CapAnnDataDF) -> None:
248
- self._validate_cap_df(cap_df, axis=1)
249
- self._var = cap_df
250
-
251
- @property
252
- def raw(self) -> RawLayer:
253
- if self._raw is None:
254
- if "raw" not in self._file.keys():
255
- logger.warning("Can't read raw.var since raw layer doesn't exist!")
256
- return
257
-
258
- if len(self._file["raw"].keys()) == 0:
259
- logger.warning("The raw layer is empty!")
260
- return
261
-
262
- self._raw = RawLayer(self._file)
263
- return self._raw
264
-
265
- @property
266
- def uns(self) -> CapAnnDataDict[str, Any]:
267
- if self._uns is None:
268
- self._uns = CapAnnDataDict(
269
- {k: NotLinkedObject for k in self._file["uns"].keys()}
270
- )
271
- return self._uns
272
-
273
- @property
274
- def layers(self) -> CapAnnDataDict[str, X_NOTATION]:
275
- if self._layers is None:
276
- self._link_layers()
277
- return self._layers
278
-
279
- @property
280
- def obsm(self) -> CapAnnDataDict[str, X_NOTATION]:
281
- if self._obsm is None:
282
- self._link_obsm()
283
- return self._obsm
284
-
285
- @property
286
- def varm(self) -> CapAnnDataDict[str, X_NOTATION]:
287
- if self._varm is None:
288
- self._link_varm()
289
- return self._varm
290
-
291
- @property
292
- def obsp(self) -> CapAnnDataDict[str, X_NOTATION]:
293
- if self._obsp is None:
294
- self._link_obsp()
295
- return self._obsp
296
-
297
- @property
298
- def varp(self) -> CapAnnDataDict[str, X_NOTATION]:
299
- if self._varp is None:
300
- self._link_varp()
301
- return self._varp
302
-
303
- def read_obs(self, columns: List[str] = None, reset: bool = False) -> None:
304
- df = self._read_df("obs", columns=columns)
305
- if self.obs.empty or reset:
306
- self._obs = df
307
- else:
308
- for col in df.columns:
309
- self._obs[col] = df[col]
310
-
311
- def read_var(self, columns: List[str] = None, reset: bool = False) -> None:
312
- df = self._read_df("var", columns=columns)
313
- if self.var.empty or reset:
314
- self._var = df
315
- else:
316
- for col in df.columns:
317
- self._var[col] = df[col]
318
-
319
- def read_uns(self, keys: List[str] = None) -> None:
320
- if keys is None:
321
- keys = list(self.uns.keys())
322
-
323
- for key in keys:
324
- existing_keys = self.uns.keys()
325
- if key in existing_keys:
326
- source = self._file[f"uns/{key}"]
327
- self.uns[key] = read_elem(source)
328
-
329
- def _link_layers(self) -> None:
330
- if self._layers is None:
331
- self._layers = CapAnnDataDict()
332
- if "layers" in self._file.keys():
333
- self._link_array_mapping(cap_dict=self._layers, key="layers")
334
-
335
- def _link_obsm(self) -> None:
336
- key = "obsm"
337
- if self._obsm is None:
338
- self._obsm = CapAnnDataDict()
339
- if key in self._file.keys():
340
- self._link_array_mapping(cap_dict=self._obsm, key=key)
341
-
342
- def _link_varm(self) -> None:
343
- key = "varm"
344
- if self._varm is None:
345
- self._varm = CapAnnDataDict()
346
- if key in self._file.keys():
347
- self._link_array_mapping(cap_dict=self._varm, key=key)
348
-
349
- def _link_obsp(self):
350
- key = "obsp"
351
- if self._obsp is None:
352
- self._obsp = CapAnnDataDict()
353
-
354
- if key in self._file.keys():
355
- self._link_array_mapping(cap_dict=self._obsp, key=key)
356
-
357
- def _link_varp(self):
358
- key = "varp"
359
- if self._varp is None:
360
- self._varp = CapAnnDataDict()
361
-
362
- if key in self._file.keys():
363
- self._link_array_mapping(cap_dict=self._varp, key=key)
364
-
365
- def obsm_keys(self) -> List[str]:
366
- return list(self.obsm.keys())
367
-
368
- def obs_keys(self) -> List[str]:
369
- return self.obs.column_order.tolist()
370
-
371
- def var_keys(self) -> List[str]:
372
- return self.var.column_order.tolist()
373
-
374
- def overwrite(self, fields: List[str] = None, compression: str = "lzf") -> None:
375
- field_to_entity = {
376
- "obs": self.obs,
377
- "var": self.var,
378
- "raw.var": self.raw.var if self.raw is not None else None,
379
- "uns": self.uns,
380
- "layers": self.layers,
381
- "obsm": self.obsm,
382
- "varm": self.varm,
383
- "obsp": self.obsp,
384
- "varp": self.varp,
385
- }
386
-
387
- if fields is None:
388
- fields = list(field_to_entity.keys())
389
- else:
390
- for f in fields:
391
- if f not in field_to_entity.keys():
392
- raise KeyError(
393
- f"The field {f} is not supported! The list of supported fields are equal to supported "
394
- f"attributes of the CapAnnData class: obs, var, raw.var and uns."
395
- )
396
-
397
- for key in ["obs", "var", "raw.var"]:
398
- if key in fields:
399
- entity: CapAnnDataDF = field_to_entity[key]
400
- if entity is None:
401
- continue
402
-
403
- key = key.replace(".", "/") if key == "raw.var" else key
404
-
405
- for col in entity.columns:
406
- self._write_elem(
407
- f"{key}/{col}", entity[col].values, compression=compression
408
- )
409
-
410
- column_order = entity.column_order
411
- if (
412
- column_order.size == 0
413
- ): # Refs https://github.com/cellannotation/cap-anndata/issues/6
414
- column_order = np.array([], dtype=np.float64)
415
- self._file[key].attrs["column-order"] = column_order
416
-
417
- if "uns" in fields:
418
- for key in self.uns.keys():
419
- if self.uns[key] is not NotLinkedObject:
420
- dest = f"uns/{key}"
421
- self._write_elem(dest, self.uns[key], compression=compression)
422
- for key in self.uns.keys_to_remove:
423
- del self._file[f"uns/{key}"]
424
-
425
- for field in ["layers", "obsm", "varm", "obsp", "varp"]:
426
- if field in fields:
427
- for key in field_to_entity[field].keys_to_remove:
428
- del self._file[f"{field}/{key}"]
429
-
430
- def create_layer(
431
- self,
432
- name: str,
433
- matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
434
- matrix_shape: Union[tuple[int, int], None] = None,
435
- data_dtype: Union[np.dtype, None] = None,
436
- format: Union[str, None] = None,
437
- compression: str = "lzf",
438
- ) -> None:
439
- """
440
- The empty layer will be created in the case of `matrix` is None.
441
- """
442
- self._create_new_matrix_in_field(
443
- field="layers",
444
- name=name,
445
- matrix=matrix,
446
- matrix_shape=matrix_shape,
447
- data_dtype=data_dtype,
448
- format=format,
449
- compression=compression,
450
- )
451
- self._link_layers()
452
-
453
- def create_obsm(
454
- self,
455
- name: str,
456
- matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
457
- matrix_shape: Union[tuple[int, int], None] = None,
458
- data_dtype: Union[np.dtype, None] = None,
459
- format: Union[str, None] = None,
460
- compression: str = "lzf",
461
- ) -> None:
462
- self._create_new_matrix_in_field(
463
- field="obsm",
464
- name=name,
465
- matrix=matrix,
466
- matrix_shape=matrix_shape,
467
- data_dtype=data_dtype,
468
- format=format,
469
- compression=compression,
470
- )
471
- self._link_obsm()
472
-
473
- def create_varm(
474
- self,
475
- name: str,
476
- matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
477
- matrix_shape: Union[tuple[int, int], None] = None,
478
- data_dtype: Union[np.dtype, None] = None,
479
- format: Union[str, None] = None,
480
- compression: str = "lzf",
481
- ) -> None:
482
- self._create_new_matrix_in_field(
483
- field="varm",
484
- name=name,
485
- matrix=matrix,
486
- matrix_shape=matrix_shape,
487
- data_dtype=data_dtype,
488
- format=format,
489
- compression=compression,
490
- )
491
- self._link_varm()
492
-
493
- def create_obsp(
494
- self,
495
- name: str,
496
- matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
497
- matrix_shape: Union[tuple[int, int], None] = None,
498
- data_dtype: Union[np.dtype, None] = None,
499
- format: Union[str, None] = None,
500
- compression: str = "lzf",
501
- ) -> None:
502
- self._create_new_matrix_in_field(
503
- field="obsp",
504
- name=name,
505
- matrix=matrix,
506
- matrix_shape=matrix_shape,
507
- data_dtype=data_dtype,
508
- format=format,
509
- compression=compression,
510
- )
511
- self._link_obsp()
512
-
513
- def create_varp(
514
- self,
515
- name: str,
516
- matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
517
- matrix_shape: Union[tuple[int, int], None] = None,
518
- data_dtype: Union[np.dtype, None] = None,
519
- format: Union[str, None] = None,
520
- compression: str = "lzf",
521
- ) -> None:
522
-
523
- self._create_new_matrix_in_field(
524
- field="varp",
525
- name=name,
526
- matrix=matrix,
527
- matrix_shape=matrix_shape,
528
- data_dtype=data_dtype,
529
- format=format,
530
- compression=compression,
531
- )
532
- self._link_varp()
533
-
534
- def _create_new_matrix_in_field(self, field, name, **kwargs):
535
- """**kwargs: matrix, matrix_shape, data_dtype, format, compression"""
536
- dest = f"{field}/{name}"
537
- field_entity = getattr(self, field)
538
- if name in field_entity.keys():
539
- raise ValueError(
540
- f"Please explicitly remove the existing '{name}' entity from {field} "
541
- f"before creating a new one!"
542
- )
543
- if field not in self._file.keys():
544
- self._file.create_group(field)
545
- self._create_new_matrix(dest=dest, **kwargs)
546
-
547
- def remove_layer(self, name: str) -> None:
548
- del self._file[f"layers/{name}"]
549
- self._link_layers()
550
-
551
- def remove_obsp(self, name: str) -> None:
552
- del self._file[f"obsp/{name}"]
553
- self._link_obsp()
554
-
555
- def remove_varp(self, name: str) -> None:
556
- del self._file[f"varp/{name}"]
557
- self._link_varp()
558
-
559
- def remove_obsm(self, name: str) -> None:
560
- del self._file[f"obsm/{name}"]
561
- self._link_obsm()
562
-
563
- def remove_varm(self, name: str) -> None:
564
- del self._file[f"varm/{name}"]
565
- self._link_varm()
566
-
567
- def create_repr(self) -> str:
568
- indent = " " * 4
569
- s = f"CapAnnData object"
570
- s += f"\n{indent}File: {self._file}"
571
- s += f"\n{indent}X shape: {self.shape}"
572
- s += f"\n{indent}Has raw X: {self.raw is not None}"
573
- for field in ["obs", "obsm", "var", "uns", "layers"]:
574
- if field in self._file:
575
- in_memory = set()
576
- if field in ["obs", "var", "uns"]:
577
- attr = getattr(self, field)
578
- if attr is not None:
579
- in_memory = set(attr.keys())
580
- keys = list(self._file[field].keys())
581
- keys = [k for k in keys if k != "_index"]
582
- keys = [(k if k not in in_memory else f"{k}*") for k in keys]
583
- keys_str = str(keys).replace("*'", "'*")
584
- s += f"\n{indent}{field}: {keys_str}"
585
- s += f"\n{indent}Note: fields marked with * are in-memory objects."
586
- return s
587
-
588
- def __repr__(self) -> str:
589
- return self.create_repr()
590
-
591
- def __str__(self) -> str:
592
- return self.create_repr()
593
-
594
- def __enter__(self):
595
- return self
596
-
597
- def __exit__(self, *args):
598
- if self._file is not None:
599
- self._file.close()
600
- logger.debug("CapAnnData closed!")
1
+ import logging
2
+ import anndata as ad
3
+ import numpy as np
4
+ import h5py
5
+ from typing import List, Union, Any, Tuple, Final
6
+ import scipy.sparse as ss
7
+ from packaging import version
8
+
9
+ if version.parse(ad.__version__) < version.parse("0.11.0"):
10
+ from anndata.experimental import sparse_dataset, read_elem, write_elem
11
+ else:
12
+ from anndata.io import sparse_dataset, read_elem, write_elem
13
+
14
+ from cap_anndata import CapAnnDataDF, CapAnnDataDict
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+ X_NOTATION = Union[
19
+ h5py.Dataset, ad.experimental.CSRDataset, ad.experimental.CSCDataset, None
20
+ ]
21
+ ARRAY_MAPPING_NOTATION = CapAnnDataDict[str, X_NOTATION]
22
+
23
+ NotLinkedObject: Final = "__NotLinkedObject"
24
+
25
+
26
+ class BaseLayerMatrixAndDf:
27
+ def __init__(self, file: h5py.File, path_to_content: str = "/") -> None:
28
+ self._file = file
29
+ self._path_to_content = path_to_content
30
+ self._X: X_NOTATION = None
31
+
32
+ @property
33
+ def file(self) -> h5py.File:
34
+ return self._file
35
+
36
+ @property
37
+ def X(self) -> X_NOTATION:
38
+ if self._X is None:
39
+ self._link_x()
40
+ return self._X
41
+
42
+ def _link_x(self) -> None:
43
+ x = self._file[self._path_to_content + "X"]
44
+ if isinstance(x, h5py.Dataset):
45
+ # dense X
46
+ self._X = x
47
+ else:
48
+ # sparse dataset
49
+ self._X = sparse_dataset(x)
50
+
51
+ @property
52
+ def shape(self) -> Tuple[int, int]:
53
+ if self.X is not None:
54
+ shape = tuple(map(int, self.X.shape))
55
+ else:
56
+ shape = None
57
+ return shape
58
+
59
+ def _lazy_df_load(self, key: str) -> CapAnnDataDF:
60
+ df = CapAnnDataDF()
61
+ attribute = self._path_to_content + key
62
+ column_order = self._read_attr(self._file[attribute], "column-order")
63
+ df.column_order = column_order
64
+ if df.column_order.dtype != object:
65
+ # empty DataFrame will have column_order as float64
66
+ # which leads to failure in overwrite method
67
+ df.column_order = df.column_order.astype(object)
68
+ return df
69
+
70
+ @staticmethod
71
+ def _read_attr(obj: Union[h5py.Group, h5py.Dataset], attr_name: str) -> any:
72
+ attrs = dict(obj.attrs)
73
+ if attr_name not in attrs.keys():
74
+ raise KeyError(f"The {attr_name} doesn't exist!")
75
+ return attrs[attr_name]
76
+
77
+ def _read_df(self, key: str, columns: List[str]) -> CapAnnDataDF:
78
+ group_path = self._path_to_content + key
79
+ if group_path not in self._file.keys():
80
+ raise ValueError(f"The group {group_path} doesn't exist in the file!")
81
+
82
+ h5_group = self._file[group_path]
83
+
84
+ column_order = self._read_attr(h5_group, "column-order")
85
+
86
+ if columns is None:
87
+ # read whole df
88
+ df = CapAnnDataDF.from_df(read_elem(h5_group), column_order=column_order)
89
+ else:
90
+ if isinstance(columns, str):
91
+ # single column provided instead of list
92
+ columns = [columns]
93
+ cols_to_read = [c for c in columns if c in column_order]
94
+ df = CapAnnDataDF()
95
+ df.column_order = column_order
96
+ index_col = self._read_attr(h5_group, "_index")
97
+ df.index = read_elem(h5_group[index_col])
98
+
99
+ for col in cols_to_read:
100
+ df[col] = read_elem(h5_group[col])
101
+
102
+ if df.column_order.dtype != object:
103
+ # empty DataFrame will have column_order as float64
104
+ # which leads to failure in overwrite method
105
+ df.column_order = df.column_order.astype(object)
106
+ return df
107
+
108
+ def _write_elem(self, dest_key: str, elem: any, compression: str) -> None:
109
+ write_elem(
110
+ self._file, dest_key, elem, dataset_kwargs={"compression": compression}
111
+ )
112
+
113
+ def _validate_cap_df(self, cap_df: CapAnnDataDF, axis: int) -> None:
114
+ if not isinstance(cap_df, CapAnnDataDF):
115
+ raise TypeError(
116
+ f"The input should be an instance of CapAnnDataDF class but {type(cap_df)} given!"
117
+ )
118
+
119
+ if axis not in [0, 1]:
120
+ raise ValueError("The axis should be either 0 or 1!")
121
+
122
+ if cap_df.shape[0] != self.shape[axis]:
123
+ items = "cells" if axis == 0 else "genes"
124
+ raise ValueError(
125
+ f"The number of rows in the input DataFrame should be equal to the number of {items} in the "
126
+ "AnnData object!"
127
+ )
128
+
129
+ def _link_array_mapping(self, cap_dict: CapAnnDataDict, key: str) -> None:
130
+ """Method to update given cap_dict with backed array entities from the file."""
131
+ if key not in self._file.keys():
132
+ raise KeyError(f"The key {key} doesn't exist in the file! Ignore linking.")
133
+
134
+ group = self._file[key]
135
+ if not isinstance(group, h5py.Group):
136
+ raise ValueError(f"The object {key} must be a group!")
137
+
138
+ for array_name in group.keys():
139
+ array = group[array_name]
140
+ if isinstance(array, h5py.Dataset):
141
+ cap_dict[array_name] = array
142
+ elif isinstance(array, h5py.Group):
143
+ cap_dict[array_name] = sparse_dataset(array)
144
+ else:
145
+ raise ValueError(
146
+ f"Can't link array in {key} due to unsupported type of object: {type(array)}"
147
+ )
148
+
149
+ def _create_new_matrix(
150
+ self,
151
+ dest: str,
152
+ matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
153
+ matrix_shape: Union[tuple[int, int], None] = None,
154
+ data_dtype: Union[np.dtype, None] = None,
155
+ format: Union[str, None] = None, # TODO: use Enum instead of str
156
+ compression: str = "lzf",
157
+ ) -> None:
158
+ if matrix is not None:
159
+ self._write_elem(dest, matrix, compression=compression)
160
+ else:
161
+ if format == "dense":
162
+ group = self._file.create_dataset(
163
+ name=dest,
164
+ shape=matrix_shape,
165
+ dtype=data_dtype,
166
+ compression=compression,
167
+ )
168
+ # https://anndata.readthedocs.io/en/latest/fileformat-prose.html#dense-arrays-specification-v0-2-0
169
+ group.attrs["encoding-type"] = "array"
170
+ group.attrs["encoding-version"] = "0.2.0"
171
+ elif format in [
172
+ "csr",
173
+ "csc",
174
+ ]: # Based on https://github.com/appier/h5sparse/blob/master/h5sparse/h5sparse.py
175
+ if data_dtype is None:
176
+ data_dtype = np.float64
177
+ if matrix_shape is None:
178
+ matrix_shape = (0, 0)
179
+ sparse_class = ss.csr_matrix if format == "csr" else ss.csc_matrix
180
+ data = sparse_class(matrix_shape, dtype=data_dtype)
181
+ self._write_elem(dest, data, compression=compression)
182
+ else:
183
+ raise NotImplementedError(
184
+ f"Format must be 'dense', 'csr' or 'csc' but {format} given!"
185
+ )
186
+
187
+
188
+ class RawLayer(BaseLayerMatrixAndDf):
189
+ def __init__(self, h5_file: h5py.File):
190
+ super().__init__(h5_file, path_to_content="/raw/")
191
+ self._var: CapAnnDataDF = None
192
+
193
+ @property
194
+ def var(self) -> CapAnnDataDF:
195
+ if self._var is None:
196
+ self._var = self._lazy_df_load("var")
197
+ return self._var
198
+
199
+ @var.setter
200
+ def var(self, cap_df: CapAnnDataDF) -> None:
201
+ self._validate_cap_df(cap_df, axis=1)
202
+ self._var = cap_df
203
+
204
+ def read_var(self, columns: List[str] = None, reset: bool = False) -> None:
205
+ df = self._read_df(key="var", columns=columns)
206
+ if self.var.empty or reset:
207
+ self._var = df
208
+ else:
209
+ for col in df.columns:
210
+ self._var[col] = df[col]
211
+
212
+
213
+ class CapAnnData(BaseLayerMatrixAndDf):
214
+ def __init__(self, h5_file: h5py.File) -> None:
215
+ super().__init__(h5_file, path_to_content="/")
216
+ self._file: h5py.File = h5_file
217
+ self._obs: CapAnnDataDF = None
218
+ self._var: CapAnnDataDF = None
219
+ self._X: X_NOTATION = None
220
+ self._obsm: CapAnnDataDict = None
221
+ self._varm: CapAnnDataDict = None
222
+ self._layers: CapAnnDataDict = None
223
+ self._uns: CapAnnDataDict = None
224
+ self._obsp: CapAnnDataDict = None
225
+ self._varp: CapAnnDataDict = None
226
+ self._raw: RawLayer = None
227
+ self._shape: Tuple[int, int] = None
228
+
229
+ @property
230
+ def obs(self) -> CapAnnDataDF:
231
+ if self._obs is None:
232
+ self._obs = self._lazy_df_load("obs")
233
+ return self._obs
234
+
235
+ @obs.setter
236
+ def obs(self, cap_df: CapAnnDataDF) -> None:
237
+ self._validate_cap_df(cap_df, axis=0)
238
+ self._obs = cap_df
239
+
240
+ @property
241
+ def var(self) -> CapAnnDataDF:
242
+ if self._var is None:
243
+ self._var = self._lazy_df_load("var")
244
+ return self._var
245
+
246
+ @var.setter
247
+ def var(self, cap_df: CapAnnDataDF) -> None:
248
+ self._validate_cap_df(cap_df, axis=1)
249
+ self._var = cap_df
250
+
251
+ @property
252
+ def raw(self) -> RawLayer:
253
+ if self._raw is None:
254
+ if "raw" not in self._file.keys():
255
+ logger.warning("Can't read raw.var since raw layer doesn't exist!")
256
+ return
257
+
258
+ if len(self._file["raw"].keys()) == 0:
259
+ logger.warning("The raw layer is empty!")
260
+ return
261
+
262
+ self._raw = RawLayer(self._file)
263
+ return self._raw
264
+
265
+ @property
266
+ def uns(self) -> CapAnnDataDict[str, Any]:
267
+ if self._uns is None:
268
+ self._uns = CapAnnDataDict(
269
+ {k: NotLinkedObject for k in self._file["uns"].keys()}
270
+ )
271
+ return self._uns
272
+
273
+ @property
274
+ def layers(self) -> CapAnnDataDict[str, X_NOTATION]:
275
+ if self._layers is None:
276
+ self._link_layers()
277
+ return self._layers
278
+
279
+ @property
280
+ def obsm(self) -> CapAnnDataDict[str, X_NOTATION]:
281
+ if self._obsm is None:
282
+ self._link_obsm()
283
+ return self._obsm
284
+
285
+ @property
286
+ def varm(self) -> CapAnnDataDict[str, X_NOTATION]:
287
+ if self._varm is None:
288
+ self._link_varm()
289
+ return self._varm
290
+
291
+ @property
292
+ def obsp(self) -> CapAnnDataDict[str, X_NOTATION]:
293
+ if self._obsp is None:
294
+ self._link_obsp()
295
+ return self._obsp
296
+
297
+ @property
298
+ def varp(self) -> CapAnnDataDict[str, X_NOTATION]:
299
+ if self._varp is None:
300
+ self._link_varp()
301
+ return self._varp
302
+
303
+ def read_obs(self, columns: List[str] = None, reset: bool = False) -> None:
304
+ df = self._read_df("obs", columns=columns)
305
+ if self.obs.empty or reset:
306
+ self._obs = df
307
+ else:
308
+ for col in df.columns:
309
+ self._obs[col] = df[col]
310
+
311
+ def read_var(self, columns: List[str] = None, reset: bool = False) -> None:
312
+ df = self._read_df("var", columns=columns)
313
+ if self.var.empty or reset:
314
+ self._var = df
315
+ else:
316
+ for col in df.columns:
317
+ self._var[col] = df[col]
318
+
319
+ def read_uns(self, keys: List[str] = None) -> None:
320
+ if keys is None:
321
+ keys = list(self.uns.keys())
322
+
323
+ for key in keys:
324
+ existing_keys = self.uns.keys()
325
+ if key in existing_keys:
326
+ source = self._file[f"uns/{key}"]
327
+ self.uns[key] = read_elem(source)
328
+
329
+ def _link_layers(self) -> None:
330
+ if self._layers is None:
331
+ self._layers = CapAnnDataDict()
332
+ if "layers" in self._file.keys():
333
+ self._link_array_mapping(cap_dict=self._layers, key="layers")
334
+
335
+ def _link_obsm(self) -> None:
336
+ key = "obsm"
337
+ if self._obsm is None:
338
+ self._obsm = CapAnnDataDict()
339
+ if key in self._file.keys():
340
+ self._link_array_mapping(cap_dict=self._obsm, key=key)
341
+
342
+ def _link_varm(self) -> None:
343
+ key = "varm"
344
+ if self._varm is None:
345
+ self._varm = CapAnnDataDict()
346
+ if key in self._file.keys():
347
+ self._link_array_mapping(cap_dict=self._varm, key=key)
348
+
349
+ def _link_obsp(self):
350
+ key = "obsp"
351
+ if self._obsp is None:
352
+ self._obsp = CapAnnDataDict()
353
+
354
+ if key in self._file.keys():
355
+ self._link_array_mapping(cap_dict=self._obsp, key=key)
356
+
357
+ def _link_varp(self):
358
+ key = "varp"
359
+ if self._varp is None:
360
+ self._varp = CapAnnDataDict()
361
+
362
+ if key in self._file.keys():
363
+ self._link_array_mapping(cap_dict=self._varp, key=key)
364
+
365
+ def obsm_keys(self) -> List[str]:
366
+ return list(self.obsm.keys())
367
+
368
+ def obs_keys(self) -> List[str]:
369
+ return self.obs.column_order.tolist()
370
+
371
+ def var_keys(self) -> List[str]:
372
+ return self.var.column_order.tolist()
373
+
374
def overwrite(self, fields: List[str] = None, compression: str = "lzf") -> None:
    """Persist the in-memory state of the selected fields to the file.

    Parameters
    ----------
    fields
        Names of the fields to write. Supported names are ``obs``, ``var``,
        ``raw.var``, ``uns``, ``layers``, ``obsm``, ``varm``, ``obsp`` and
        ``varp``. ``None`` (default) selects all of them.
    compression
        Forwarded to ``_write_elem`` for every element that is written.

    Raises
    ------
    KeyError
        If ``fields`` contains an unsupported name. Validation happens
        before anything is written.
    """
    field_to_entity = {
        "obs": self.obs,
        "var": self.var,
        "raw.var": self.raw.var if self.raw is not None else None,
        "uns": self.uns,
        "layers": self.layers,
        "obsm": self.obsm,
        "varm": self.varm,
        "obsp": self.obsp,
        "varp": self.varp,
    }

    if fields is None:
        fields = list(field_to_entity)
    else:
        for f in fields:
            if f not in field_to_entity:
                # Build the list from the mapping so the message cannot
                # drift from the set of fields that is really accepted.
                supported = ", ".join(field_to_entity)
                raise KeyError(
                    f"The field {f} is not supported! "
                    f"The supported fields are: {supported}."
                )

    # Data frames: write every column, then the column order attribute.
    for key in ["obs", "var", "raw.var"]:
        if key not in fields:
            continue
        entity: CapAnnDataDF = field_to_entity[key]
        if entity is None:
            continue

        # "raw.var" is stored under the "raw/var" path in the file.
        group = "raw/var" if key == "raw.var" else key

        for col in entity.columns:
            self._write_elem(
                f"{group}/{col}", entity[col].values, compression=compression
            )

        column_order = entity.column_order
        if column_order.size == 0:
            # Refs https://github.com/cellannotation/cap-anndata/issues/6
            column_order = np.array([], dtype=np.float64)
        self._file[group].attrs["column-order"] = column_order

    if "uns" in fields:
        for key in self.uns.keys():
            # Entries still holding the placeholder are not written.
            if self.uns[key] is not NotLinkedObject:
                self._write_elem(
                    f"uns/{key}", self.uns[key], compression=compression
                )

    # Deletions. A recorded key may be missing from the file (already
    # deleted by an earlier overwrite, or never written at all), so check
    # before deleting instead of failing with KeyError.
    for field in ["uns", "layers", "obsm", "varm", "obsp", "varp"]:
        if field not in fields:
            continue
        for key in field_to_entity[field].keys_to_remove:
            path = f"{field}/{key}"
            if path in self._file:
                del self._file[path]
429
+
430
def create_layer(
    self,
    name: str,
    matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
    matrix_shape: Union[tuple[int, int], None] = None,
    data_dtype: Union[np.dtype, None] = None,
    format: Union[str, None] = None,
    compression: str = "lzf",
) -> None:
    """
    Create the matrix `name` in `layers`, then re-link the layers.

    An empty layer is created when `matrix` is None.
    All matrix options are forwarded to `_create_new_matrix_in_field`.
    """
    matrix_options = {
        "matrix": matrix,
        "matrix_shape": matrix_shape,
        "data_dtype": data_dtype,
        "format": format,
        "compression": compression,
    }
    self._create_new_matrix_in_field("layers", name, **matrix_options)
    self._link_layers()
452
+
453
def create_obsm(
    self,
    name: str,
    matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
    matrix_shape: Union[tuple[int, int], None] = None,
    data_dtype: Union[np.dtype, None] = None,
    format: Union[str, None] = None,
    compression: str = "lzf",
) -> None:
    """
    Create the matrix `name` in `obsm`, then re-link `obsm`.

    All matrix options are forwarded to `_create_new_matrix_in_field`.
    """
    matrix_options = {
        "matrix": matrix,
        "matrix_shape": matrix_shape,
        "data_dtype": data_dtype,
        "format": format,
        "compression": compression,
    }
    self._create_new_matrix_in_field("obsm", name, **matrix_options)
    self._link_obsm()
472
+
473
def create_varm(
    self,
    name: str,
    matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
    matrix_shape: Union[tuple[int, int], None] = None,
    data_dtype: Union[np.dtype, None] = None,
    format: Union[str, None] = None,
    compression: str = "lzf",
) -> None:
    """
    Create the matrix `name` in `varm`, then re-link `varm`.

    All matrix options are forwarded to `_create_new_matrix_in_field`.
    """
    matrix_options = {
        "matrix": matrix,
        "matrix_shape": matrix_shape,
        "data_dtype": data_dtype,
        "format": format,
        "compression": compression,
    }
    self._create_new_matrix_in_field("varm", name, **matrix_options)
    self._link_varm()
492
+
493
def create_obsp(
    self,
    name: str,
    matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
    matrix_shape: Union[tuple[int, int], None] = None,
    data_dtype: Union[np.dtype, None] = None,
    format: Union[str, None] = None,
    compression: str = "lzf",
) -> None:
    """
    Create the matrix `name` in `obsp`, then re-link `obsp`.

    All matrix options are forwarded to `_create_new_matrix_in_field`.
    """
    matrix_options = {
        "matrix": matrix,
        "matrix_shape": matrix_shape,
        "data_dtype": data_dtype,
        "format": format,
        "compression": compression,
    }
    self._create_new_matrix_in_field("obsp", name, **matrix_options)
    self._link_obsp()
512
+
513
def create_varp(
    self,
    name: str,
    matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
    matrix_shape: Union[tuple[int, int], None] = None,
    data_dtype: Union[np.dtype, None] = None,
    format: Union[str, None] = None,
    compression: str = "lzf",
) -> None:
    """
    Create the matrix `name` in `varp`, then re-link `varp`.

    All matrix options are forwarded to `_create_new_matrix_in_field`.
    """
    matrix_options = {
        "matrix": matrix,
        "matrix_shape": matrix_shape,
        "data_dtype": data_dtype,
        "format": format,
        "compression": compression,
    }
    self._create_new_matrix_in_field("varp", name, **matrix_options)
    self._link_varp()
533
+
534
def _create_new_matrix_in_field(self, field, name, **kwargs):
    """Create the matrix `name` under the `field` group of the file.

    **kwargs: matrix, matrix_shape, data_dtype, format, compression
    (passed through to `_create_new_matrix`).

    The `field` group is created in the file when it does not exist yet.
    Raises ValueError if `field` already has an entity called `name`.
    """
    existing = getattr(self, field)
    if name in existing.keys():
        raise ValueError(
            f"Please explicitly remove the existing '{name}' entity from {field} "
            f"before creating a new one!"
        )
    if field not in self._file.keys():
        self._file.create_group(field)
    self._create_new_matrix(dest=f"{field}/{name}", **kwargs)
546
+
547
def remove_layer(self, name: str) -> None:
    """Delete ``layers/<name>`` from the file, then re-run layer linking."""
    target = f"layers/{name}"
    del self._file[target]
    self._link_layers()
550
+
551
def remove_obsp(self, name: str) -> None:
    """Delete ``obsp/<name>`` from the file, then re-run ``obsp`` linking."""
    target = f"obsp/{name}"
    del self._file[target]
    self._link_obsp()
554
+
555
def remove_varp(self, name: str) -> None:
    """Delete ``varp/<name>`` from the file, then re-run ``varp`` linking."""
    target = f"varp/{name}"
    del self._file[target]
    self._link_varp()
558
+
559
def remove_obsm(self, name: str) -> None:
    """Delete ``obsm/<name>`` from the file, then re-run ``obsm`` linking."""
    target = f"obsm/{name}"
    del self._file[target]
    self._link_obsm()
562
+
563
def remove_varm(self, name: str) -> None:
    """Delete ``varm/<name>`` from the file, then re-run ``varm`` linking."""
    target = f"varm/{name}"
    del self._file[target]
    self._link_varm()
566
+
567
def create_repr(self) -> str:
    """Build the multi-line text summary used by ``__repr__`` and ``__str__``.

    The summary names the file, the X shape and whether raw X exists, then
    lists the keys stored in the file for obs, obsm, var, uns and layers.
    For obs, var and uns, keys that are also present in the in-memory
    object are marked with a trailing ``*``.
    """
    pad = " " * 4
    parts = [
        "CapAnnData object",
        f"{pad}File: {self._file}",
        f"{pad}X shape: {self.shape}",
        f"{pad}Has raw X: {self.raw is not None}",
    ]
    for field in ("obs", "obsm", "var", "uns", "layers"):
        if field not in self._file:
            continue
        loaded = set()
        if field in ("obs", "var", "uns"):
            entity = getattr(self, field)
            if entity is not None:
                loaded = set(entity.keys())
        # "_index" is left out of the listing.
        labels = [
            f"{k}*" if k in loaded else k
            for k in self._file[field].keys()
            if k != "_index"
        ]
        # Move the marker outside the closing quote: 'key*' -> 'key'*
        listing = str(labels).replace("*'", "'*")
        parts.append(f"{pad}{field}: {listing}")
    parts.append(f"{pad}Note: fields marked with * are in-memory objects.")
    return "\n".join(parts)
587
+
588
def __repr__(self) -> str:
    """Return the summary text produced by ``create_repr``."""
    summary = self.create_repr()
    return summary
590
+
591
def __str__(self) -> str:
    """Return the summary text produced by ``create_repr``."""
    summary = self.create_repr()
    return summary
593
+
594
def __enter__(self):
    """Enter the ``with`` block; the object itself is the context value."""
    return self
596
+
597
def __exit__(self, *args):
    """Leave the ``with`` block: close the file if one is set, then log.

    The exception details passed in ``args`` are not used. Nothing is
    returned, so an exception raised inside the block is not suppressed.
    """
    handle = self._file
    if handle is not None:
        handle.close()
    # NOTE(review): indentation was lost in the source view; this call is
    # assumed to sit outside the `if` above - confirm against the package.
    logger.debug("CapAnnData closed!")