cap-anndata 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
@@ -1,600 +1,625 @@
1
- import logging
2
- import anndata as ad
3
- import numpy as np
4
- import h5py
5
- from typing import List, Union, Any, Tuple, Final
6
- import scipy.sparse as ss
7
- from packaging import version
8
-
9
- if version.parse(ad.__version__) < version.parse("0.11.0"):
10
- from anndata.experimental import sparse_dataset, read_elem, write_elem
11
- else:
12
- from anndata import sparse_dataset, read_elem, write_elem
13
-
14
- from cap_anndata import CapAnnDataDF, CapAnnDataDict
15
-
16
- logger = logging.getLogger(__name__)
17
-
18
- X_NOTATION = Union[
19
- h5py.Dataset, ad.experimental.CSRDataset, ad.experimental.CSCDataset, None
20
- ]
21
- ARRAY_MAPPING_NOTATION = CapAnnDataDict[str, X_NOTATION]
22
-
23
- NotLinkedObject: Final = "__NotLinkedObject"
24
-
25
-
26
- class BaseLayerMatrixAndDf:
27
- def __init__(self, file: h5py.File, path_to_content: str = "/") -> None:
28
- self._file = file
29
- self._path_to_content = path_to_content
30
- self._X: X_NOTATION = None
31
-
32
- @property
33
- def file(self) -> h5py.File:
34
- return self._file
35
-
36
- @property
37
- def X(self) -> X_NOTATION:
38
- if self._X is None:
39
- self._link_x()
40
- return self._X
41
-
42
- def _link_x(self) -> None:
43
- x = self._file[self._path_to_content + "X"]
44
- if isinstance(x, h5py.Dataset):
45
- # dense X
46
- self._X = x
47
- else:
48
- # sparse dataset
49
- self._X = sparse_dataset(x)
50
-
51
- @property
52
- def shape(self) -> Tuple[int, int]:
53
- if self.X is not None:
54
- shape = tuple(map(int, self.X.shape))
55
- else:
56
- shape = None
57
- return shape
58
-
59
- def _lazy_df_load(self, key: str) -> CapAnnDataDF:
60
- df = CapAnnDataDF()
61
- attribute = self._path_to_content + key
62
- column_order = self._read_attr(self._file[attribute], "column-order")
63
- df.column_order = column_order
64
- if df.column_order.dtype != object:
65
- # empty DataFrame will have column_order as float64
66
- # which leads to failure in overwrite method
67
- df.column_order = df.column_order.astype(object)
68
- return df
69
-
70
- @staticmethod
71
- def _read_attr(obj: Union[h5py.Group, h5py.Dataset], attr_name: str) -> Any:
72
- attrs = dict(obj.attrs)
73
- if attr_name not in attrs.keys():
74
- raise KeyError(f"The {attr_name} doesn't exist!")
75
- return attrs[attr_name]
76
-
77
- def _read_df(self, key: str, columns: List[str]) -> CapAnnDataDF:
78
- group_path = self._path_to_content + key
79
- if group_path not in self._file.keys():
80
- raise ValueError(f"The group {group_path} doesn't exist in the file!")
81
-
82
- h5_group = self._file[group_path]
83
-
84
- column_order = self._read_attr(h5_group, "column-order")
85
-
86
- if columns is None:
87
- # read whole df
88
- df = CapAnnDataDF.from_df(read_elem(h5_group), column_order=column_order)
89
- else:
90
- if isinstance(columns, str):
91
- # single column provided instead of list
92
- columns = [columns]
93
- cols_to_read = [c for c in columns if c in column_order]
94
- df = CapAnnDataDF()
95
- df.column_order = column_order
96
- index_col = self._read_attr(h5_group, "_index")
97
- df.index = read_elem(h5_group[index_col])
98
-
99
- for col in cols_to_read:
100
- df[col] = read_elem(h5_group[col])
101
-
102
- if df.column_order.dtype != object:
103
- # empty DataFrame will have column_order as float64
104
- # which leads to failure in overwrite method
105
- df.column_order = df.column_order.astype(object)
106
- return df
107
-
108
- def _write_elem(self, dest_key: str, elem: Any, compression: str) -> None:
109
- write_elem(
110
- self._file, dest_key, elem, dataset_kwargs={"compression": compression}
111
- )
112
-
113
- def _validate_cap_df(self, cap_df: CapAnnDataDF, axis: int) -> None:
114
- if not isinstance(cap_df, CapAnnDataDF):
115
- raise TypeError(
116
- f"The input should be an instance of CapAnnDataDF class but {type(cap_df)} given!"
117
- )
118
-
119
- if axis not in [0, 1]:
120
- raise ValueError("The axis should be either 0 or 1!")
121
-
122
- if cap_df.shape[0] != self.shape[axis]:
123
- items = "cells" if axis == 0 else "genes"
124
- raise ValueError(
125
- f"The number of rows in the input DataFrame should be equal to the number of {items} in the "
126
- "AnnData object!"
127
- )
128
-
129
- def _link_array_mapping(self, cap_dict: CapAnnDataDict, key: str) -> None:
130
- """Method to update given cap_dict with backed array entities from the file."""
131
- if key not in self._file.keys():
132
- raise KeyError(f"The key {key} doesn't exist in the file! Ignore linking.")
133
-
134
- group = self._file[key]
135
- if not isinstance(group, h5py.Group):
136
- raise ValueError(f"The object {key} must be a group!")
137
-
138
- for array_name in group.keys():
139
- array = group[array_name]
140
- if isinstance(array, h5py.Dataset):
141
- cap_dict[array_name] = array
142
- elif isinstance(array, h5py.Group):
143
- cap_dict[array_name] = sparse_dataset(array)
144
- else:
145
- raise ValueError(
146
- f"Can't link array in {key} due to unsupported type of object: {type(array)}"
147
- )
148
-
149
- def _create_new_matrix(
150
- self,
151
- dest: str,
152
- matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
153
- matrix_shape: Union[tuple[int, int], None] = None,
154
- data_dtype: Union[np.dtype, None] = None,
155
- format: Union[str, None] = None, # TODO: use Enum instead of str
156
- compression: str = "lzf",
157
- ) -> None:
158
- if matrix is not None:
159
- self._write_elem(dest, matrix, compression=compression)
160
- else:
161
- if format == "dense":
162
- group = self._file.create_dataset(
163
- name=dest,
164
- shape=matrix_shape,
165
- dtype=data_dtype,
166
- compression=compression,
167
- )
168
- # https://anndata.readthedocs.io/en/latest/fileformat-prose.html#dense-arrays-specification-v0-2-0
169
- group.attrs["encoding-type"] = "array"
170
- group.attrs["encoding-version"] = "0.2.0"
171
- elif format in [
172
- "csr",
173
- "csc",
174
- ]: # Based on https://github.com/appier/h5sparse/blob/master/h5sparse/h5sparse.py
175
- if data_dtype is None:
176
- data_dtype = np.float64
177
- if matrix_shape is None:
178
- matrix_shape = (0, 0)
179
- sparse_class = ss.csr_matrix if format == "csr" else ss.csc_matrix
180
- data = sparse_class(matrix_shape, dtype=data_dtype)
181
- self._write_elem(dest, data, compression=compression)
182
- else:
183
- raise NotImplementedError(
184
- f"Format must be 'dense', 'csr' or 'csc' but {format} given!"
185
- )
186
-
187
-
188
- class RawLayer(BaseLayerMatrixAndDf):
189
- def __init__(self, h5_file: h5py.File):
190
- super().__init__(h5_file, path_to_content="/raw/")
191
- self._var: CapAnnDataDF = None
192
-
193
- @property
194
- def var(self) -> CapAnnDataDF:
195
- if self._var is None:
196
- self._var = self._lazy_df_load("var")
197
- return self._var
198
-
199
- @var.setter
200
- def var(self, cap_df: CapAnnDataDF) -> None:
201
- self._validate_cap_df(cap_df, axis=1)
202
- self._var = cap_df
203
-
204
- def read_var(self, columns: List[str] = None, reset: bool = False) -> None:
205
- df = self._read_df(key="var", columns=columns)
206
- if self.var.empty or reset:
207
- self._var = df
208
- else:
209
- for col in df.columns:
210
- self._var[col] = df[col]
211
-
212
-
213
- class CapAnnData(BaseLayerMatrixAndDf):
214
- def __init__(self, h5_file: h5py.File) -> None:
215
- super().__init__(h5_file, path_to_content="/")
216
- self._file: h5py.File = h5_file
217
- self._obs: CapAnnDataDF = None
218
- self._var: CapAnnDataDF = None
219
- self._X: X_NOTATION = None
220
- self._obsm: CapAnnDataDict = None
221
- self._varm: CapAnnDataDict = None
222
- self._layers: CapAnnDataDict = None
223
- self._uns: CapAnnDataDict = None
224
- self._obsp: CapAnnDataDict = None
225
- self._varp: CapAnnDataDict = None
226
- self._raw: RawLayer = None
227
- self._shape: Tuple[int, int] = None
228
-
229
- @property
230
- def obs(self) -> CapAnnDataDF:
231
- if self._obs is None:
232
- self._obs = self._lazy_df_load("obs")
233
- return self._obs
234
-
235
- @obs.setter
236
- def obs(self, cap_df: CapAnnDataDF) -> None:
237
- self._validate_cap_df(cap_df, axis=0)
238
- self._obs = cap_df
239
-
240
- @property
241
- def var(self) -> CapAnnDataDF:
242
- if self._var is None:
243
- self._var = self._lazy_df_load("var")
244
- return self._var
245
-
246
- @var.setter
247
- def var(self, cap_df: CapAnnDataDF) -> None:
248
- self._validate_cap_df(cap_df, axis=1)
249
- self._var = cap_df
250
-
251
- @property
252
- def raw(self) -> RawLayer:
253
- if self._raw is None:
254
- if "raw" not in self._file.keys():
255
- logger.warning("Can't read raw.var since raw layer doesn't exist!")
256
- return
257
-
258
- if len(self._file["raw"].keys()) == 0:
259
- logger.warning("The raw layer is empty!")
260
- return
261
-
262
- self._raw = RawLayer(self._file)
263
- return self._raw
264
-
265
- @property
266
- def uns(self) -> CapAnnDataDict[str, Any]:
267
- if self._uns is None:
268
- self._uns = CapAnnDataDict(
269
- {k: NotLinkedObject for k in self._file["uns"].keys()}
270
- )
271
- return self._uns
272
-
273
- @property
274
- def layers(self) -> CapAnnDataDict[str, X_NOTATION]:
275
- if self._layers is None:
276
- self._link_layers()
277
- return self._layers
278
-
279
- @property
280
- def obsm(self) -> CapAnnDataDict[str, X_NOTATION]:
281
- if self._obsm is None:
282
- self._link_obsm()
283
- return self._obsm
284
-
285
- @property
286
- def varm(self) -> CapAnnDataDict[str, X_NOTATION]:
287
- if self._varm is None:
288
- self._link_varm()
289
- return self._varm
290
-
291
- @property
292
- def obsp(self) -> CapAnnDataDict[str, X_NOTATION]:
293
- if self._obsp is None:
294
- self._link_obsp()
295
- return self._obsp
296
-
297
- @property
298
- def varp(self) -> CapAnnDataDict[str, X_NOTATION]:
299
- if self._varp is None:
300
- self._link_varp()
301
- return self._varp
302
-
303
- def read_obs(self, columns: List[str] = None, reset: bool = False) -> None:
304
- df = self._read_df("obs", columns=columns)
305
- if self.obs.empty or reset:
306
- self._obs = df
307
- else:
308
- for col in df.columns:
309
- self._obs[col] = df[col]
310
-
311
- def read_var(self, columns: List[str] = None, reset: bool = False) -> None:
312
- df = self._read_df("var", columns=columns)
313
- if self.var.empty or reset:
314
- self._var = df
315
- else:
316
- for col in df.columns:
317
- self._var[col] = df[col]
318
-
319
- def read_uns(self, keys: List[str] = None) -> None:
320
- if keys is None:
321
- keys = list(self.uns.keys())
322
-
323
- for key in keys:
324
- existing_keys = self.uns.keys()
325
- if key in existing_keys:
326
- source = self._file[f"uns/{key}"]
327
- self.uns[key] = read_elem(source)
328
-
329
- def _link_layers(self) -> None:
330
- if self._layers is None:
331
- self._layers = CapAnnDataDict()
332
- if "layers" in self._file.keys():
333
- self._link_array_mapping(cap_dict=self._layers, key="layers")
334
-
335
- def _link_obsm(self) -> None:
336
- key = "obsm"
337
- if self._obsm is None:
338
- self._obsm = CapAnnDataDict()
339
- if key in self._file.keys():
340
- self._link_array_mapping(cap_dict=self._obsm, key=key)
341
-
342
- def _link_varm(self) -> None:
343
- key = "varm"
344
- if self._varm is None:
345
- self._varm = CapAnnDataDict()
346
- if key in self._file.keys():
347
- self._link_array_mapping(cap_dict=self._varm, key=key)
348
-
349
- def _link_obsp(self):
350
- key = "obsp"
351
- if self._obsp is None:
352
- self._obsp = CapAnnDataDict()
353
-
354
- if key in self._file.keys():
355
- self._link_array_mapping(cap_dict=self._obsp, key=key)
356
-
357
- def _link_varp(self):
358
- key = "varp"
359
- if self._varp is None:
360
- self._varp = CapAnnDataDict()
361
-
362
- if key in self._file.keys():
363
- self._link_array_mapping(cap_dict=self._varp, key=key)
364
-
365
- def obsm_keys(self) -> List[str]:
366
- return list(self.obsm.keys())
367
-
368
- def obs_keys(self) -> List[str]:
369
- return self.obs.column_order.tolist()
370
-
371
- def var_keys(self) -> List[str]:
372
- return self.var.column_order.tolist()
373
-
374
- def overwrite(self, fields: List[str] = None, compression: str = "lzf") -> None:
375
- field_to_entity = {
376
- "obs": self.obs,
377
- "var": self.var,
378
- "raw.var": self.raw.var if self.raw is not None else None,
379
- "uns": self.uns,
380
- "layers": self.layers,
381
- "obsm": self.obsm,
382
- "varm": self.varm,
383
- "obsp": self.obsp,
384
- "varp": self.varp,
385
- }
386
-
387
- if fields is None:
388
- fields = list(field_to_entity.keys())
389
- else:
390
- for f in fields:
391
- if f not in field_to_entity.keys():
392
- raise KeyError(
393
- f"The field {f} is not supported! The list of supported fields are equal to supported "
394
- f"attributes of the CapAnnData class: obs, var, raw.var and uns."
395
- )
396
-
397
- for key in ["obs", "var", "raw.var"]:
398
- if key in fields:
399
- entity: CapAnnDataDF = field_to_entity[key]
400
- if entity is None:
401
- continue
402
-
403
- key = key.replace(".", "/") if key == "raw.var" else key
404
-
405
- for col in entity.columns:
406
- self._write_elem(
407
- f"{key}/{col}", entity[col].values, compression=compression
408
- )
409
-
410
- column_order = entity.column_order
411
- if (
412
- column_order.size == 0
413
- ): # Refs https://github.com/cellannotation/cap-anndata/issues/6
414
- column_order = np.array([], dtype=np.float64)
415
- self._file[key].attrs["column-order"] = column_order
416
-
417
- if "uns" in fields:
418
- for key in self.uns.keys():
419
- if self.uns[key] is not NotLinkedObject:
420
- dest = f"uns/{key}"
421
- self._write_elem(dest, self.uns[key], compression=compression)
422
- for key in self.uns.keys_to_remove:
423
- del self._file[f"uns/{key}"]
424
-
425
- for field in ["layers", "obsm", "varm", "obsp", "varp"]:
426
- if field in fields:
427
- for key in field_to_entity[field].keys_to_remove:
428
- del self._file[f"{field}/{key}"]
429
-
430
- def create_layer(
431
- self,
432
- name: str,
433
- matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
434
- matrix_shape: Union[tuple[int, int], None] = None,
435
- data_dtype: Union[np.dtype, None] = None,
436
- format: Union[str, None] = None,
437
- compression: str = "lzf",
438
- ) -> None:
439
- """
440
- An empty layer is created when `matrix` is None.
441
- """
442
- self._create_new_matrix_in_field(
443
- field="layers",
444
- name=name,
445
- matrix=matrix,
446
- matrix_shape=matrix_shape,
447
- data_dtype=data_dtype,
448
- format=format,
449
- compression=compression,
450
- )
451
- self._link_layers()
452
-
453
- def create_obsm(
454
- self,
455
- name: str,
456
- matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
457
- matrix_shape: Union[tuple[int, int], None] = None,
458
- data_dtype: Union[np.dtype, None] = None,
459
- format: Union[str, None] = None,
460
- compression: str = "lzf",
461
- ) -> None:
462
- self._create_new_matrix_in_field(
463
- field="obsm",
464
- name=name,
465
- matrix=matrix,
466
- matrix_shape=matrix_shape,
467
- data_dtype=data_dtype,
468
- format=format,
469
- compression=compression,
470
- )
471
- self._link_obsm()
472
-
473
- def create_varm(
474
- self,
475
- name: str,
476
- matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
477
- matrix_shape: Union[tuple[int, int], None] = None,
478
- data_dtype: Union[np.dtype, None] = None,
479
- format: Union[str, None] = None,
480
- compression: str = "lzf",
481
- ) -> None:
482
- self._create_new_matrix_in_field(
483
- field="varm",
484
- name=name,
485
- matrix=matrix,
486
- matrix_shape=matrix_shape,
487
- data_dtype=data_dtype,
488
- format=format,
489
- compression=compression,
490
- )
491
- self._link_varm()
492
-
493
- def create_obsp(
494
- self,
495
- name: str,
496
- matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
497
- matrix_shape: Union[tuple[int, int], None] = None,
498
- data_dtype: Union[np.dtype, None] = None,
499
- format: Union[str, None] = None,
500
- compression: str = "lzf",
501
- ) -> None:
502
- self._create_new_matrix_in_field(
503
- field="obsp",
504
- name=name,
505
- matrix=matrix,
506
- matrix_shape=matrix_shape,
507
- data_dtype=data_dtype,
508
- format=format,
509
- compression=compression,
510
- )
511
- self._link_obsp()
512
-
513
- def create_varp(
514
- self,
515
- name: str,
516
- matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
517
- matrix_shape: Union[tuple[int, int], None] = None,
518
- data_dtype: Union[np.dtype, None] = None,
519
- format: Union[str, None] = None,
520
- compression: str = "lzf",
521
- ) -> None:
522
-
523
- self._create_new_matrix_in_field(
524
- field="varp",
525
- name=name,
526
- matrix=matrix,
527
- matrix_shape=matrix_shape,
528
- data_dtype=data_dtype,
529
- format=format,
530
- compression=compression,
531
- )
532
- self._link_varp()
533
-
534
- def _create_new_matrix_in_field(self, field, name, **kwargs):
535
- """**kwargs: matrix, matrix_shape, data_dtype, format, compression"""
536
- dest = f"{field}/{name}"
537
- field_entity = getattr(self, field)
538
- if name in field_entity.keys():
539
- raise ValueError(
540
- f"Please explicitly remove the existing '{name}' entity from {field} "
541
- f"before creating a new one!"
542
- )
543
- if field not in self._file.keys():
544
- self._file.create_group(field)
545
- self._create_new_matrix(dest=dest, **kwargs)
546
-
547
- def remove_layer(self, name: str) -> None:
548
- del self._file[f"layers/{name}"]
549
- self._link_layers()
550
-
551
- def remove_obsp(self, name: str) -> None:
552
- del self._file[f"obsp/{name}"]
553
- self._link_obsp()
554
-
555
- def remove_varp(self, name: str) -> None:
556
- del self._file[f"varp/{name}"]
557
- self._link_varp()
558
-
559
- def remove_obsm(self, name: str) -> None:
560
- del self._file[f"obsm/{name}"]
561
- self._link_obsm()
562
-
563
- def remove_varm(self, name: str) -> None:
564
- del self._file[f"varm/{name}"]
565
- self._link_varm()
566
-
567
- def create_repr(self) -> str:
568
- indent = " " * 4
569
- s = f"CapAnnData object"
570
- s += f"\n{indent}File: {self._file}"
571
- s += f"\n{indent}X shape: {self.shape}"
572
- s += f"\n{indent}Has raw X: {self.raw is not None}"
573
- for field in ["obs", "obsm", "var", "uns", "layers"]:
574
- if field in self._file:
575
- in_memory = set()
576
- if field in ["obs", "var", "uns"]:
577
- attr = getattr(self, field)
578
- if attr is not None:
579
- in_memory = set(attr.keys())
580
- keys = list(self._file[field].keys())
581
- keys = [k for k in keys if k != "_index"]
582
- keys = [(k if k not in in_memory else f"{k}*") for k in keys]
583
- keys_str = str(keys).replace("*'", "'*")
584
- s += f"\n{indent}{field}: {keys_str}"
585
- s += f"\n{indent}Note: fields marked with * are in-memory objects."
586
- return s
587
-
588
- def __repr__(self) -> str:
589
- return self.create_repr()
590
-
591
- def __str__(self) -> str:
592
- return self.create_repr()
593
-
594
- def __enter__(self):
595
- return self
596
-
597
- def __exit__(self, *args):
598
- if self._file is not None:
599
- self._file.close()
600
- logger.debug("CapAnnData closed!")
1
+ import logging
2
+ import anndata as ad
3
+ import numpy as np
4
+ import h5py
5
+ from typing import List, Union, Any, Tuple, Final
6
+ import scipy.sparse as ss
7
+ from packaging import version
8
+
9
+ if version.parse(ad.__version__) < version.parse("0.11.0"):
10
+ from anndata.experimental import (
11
+ sparse_dataset,
12
+ read_elem,
13
+ write_elem,
14
+ CSRDataset,
15
+ CSCDataset,
16
+ )
17
+ else:
18
+ from anndata.io import (
19
+ sparse_dataset,
20
+ read_elem,
21
+ write_elem,
22
+ )
23
+ from anndata.abc import (
24
+ CSRDataset,
25
+ CSCDataset,
26
+ )
27
+
28
+ from cap_anndata import CapAnnDataDF, CapAnnDataDict
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+ X_NOTATION = Union[
33
+ h5py.Dataset, CSRDataset, CSCDataset, None
34
+ ]
35
+ ARRAY_MAPPING_NOTATION = CapAnnDataDict[str, X_NOTATION]
36
+ FIELDS_SUPPORTED_TO_OVERWRITE = ["obs", "var", "raw.var", "uns", "layers", "obsm", "varm", "obsp", "varp"]
37
+ NotLinkedObject: Final = "__NotLinkedObject"
38
+
39
+
40
+ class BaseLayerMatrixAndDf:
41
+ def __init__(self, file: h5py.File, path_to_content: str = "/") -> None:
42
+ self._file = file
43
+ self._path_to_content = path_to_content
44
+ self._X: X_NOTATION = None
45
+
46
+ @property
47
+ def file(self) -> h5py.File:
48
+ return self._file
49
+
50
+ @property
51
+ def X(self) -> X_NOTATION:
52
+ if self._X is None:
53
+ self._link_x()
54
+ return self._X
55
+
56
+ def _link_x(self) -> None:
57
+ x = self._file[self._path_to_content + "X"]
58
+ if isinstance(x, h5py.Dataset):
59
+ # dense X
60
+ self._X = x
61
+ else:
62
+ # sparse dataset
63
+ self._X = sparse_dataset(x)
64
+
65
+ @property
66
+ def shape(self) -> Tuple[int, int]:
67
+ if self.X is not None:
68
+ shape = tuple(map(int, self.X.shape))
69
+ else:
70
+ shape = None
71
+ return shape
72
+
73
+ def _lazy_df_load(self, key: str) -> CapAnnDataDF:
74
+ return self._read_df(key=key, columns=[])
75
+
76
+ @staticmethod
77
+ def _read_attr(obj: Union[h5py.Group, h5py.Dataset], attr_name: str) -> Any:
78
+ attrs = dict(obj.attrs)
79
+ if attr_name not in attrs.keys():
80
+ raise KeyError(f"The {attr_name} doesn't exist!")
81
+ return attrs[attr_name]
82
+
83
+ def _read_df(self, key: str, columns: List[str]) -> CapAnnDataDF:
84
+ group_path = self._path_to_content + key
85
+ if group_path not in self._file.keys():
86
+ raise ValueError(f"The group {group_path} doesn't exist in the file!")
87
+
88
+ h5_group = self._file[group_path]
89
+
90
+ column_order = self._read_attr(h5_group, "column-order")
91
+
92
+ if columns is None:
93
+ # read whole df
94
+ df = CapAnnDataDF.from_df(read_elem(h5_group), column_order=column_order)
95
+ else:
96
+ if isinstance(columns, str):
97
+ # single column provided instead of list
98
+ columns = [columns]
99
+ cols_to_read = [c for c in columns if c in column_order]
100
+ df = CapAnnDataDF()
101
+ df.column_order = column_order
102
+
103
+ index_col = self._read_attr(h5_group, "_index")
104
+ index = read_elem(h5_group[index_col])
105
+ df.index = index
106
+
107
+ for col in cols_to_read:
108
+ df[col] = read_elem(h5_group[col])
109
+
110
+ if df.column_order.dtype != object:
111
+ # empty DataFrame will have column_order as float64
112
+ # which leads to failure in overwrite method
113
+ df.column_order = df.column_order.astype(object)
114
+ return df
115
+
116
+ def _write_elem(self, dest_key: str, elem: Any, compression: str) -> None:
117
+ write_elem(
118
+ self._file, dest_key, elem, dataset_kwargs={"compression": compression}
119
+ )
120
+
121
+ def _validate_cap_df(self, cap_df: CapAnnDataDF, axis: int) -> None:
122
+ if not isinstance(cap_df, CapAnnDataDF):
123
+ raise TypeError(
124
+ f"The input should be an instance of CapAnnDataDF class but {type(cap_df)} given!"
125
+ )
126
+
127
+ if axis not in [0, 1]:
128
+ raise ValueError("The axis should be either 0 or 1!")
129
+
130
+ if cap_df.shape[0] != self.shape[axis]:
131
+ items = "cells" if axis == 0 else "genes"
132
+ raise ValueError(
133
+ f"The number of rows in the input DataFrame should be equal to the number of {items} in the "
134
+ "AnnData object!"
135
+ )
136
+
137
+ def _link_array_mapping(self, cap_dict: CapAnnDataDict, key: str) -> None:
138
+ """Method to update given cap_dict with backed array entities from the file."""
139
+ if key not in self._file.keys():
140
+ raise KeyError(f"The key {key} doesn't exist in the file! Ignore linking.")
141
+
142
+ group = self._file[key]
143
+ if not isinstance(group, h5py.Group):
144
+ raise ValueError(f"The object {key} must be a group!")
145
+
146
+ for array_name in group.keys():
147
+ array = group[array_name]
148
+ if isinstance(array, h5py.Dataset):
149
+ cap_dict[array_name] = array
150
+ elif isinstance(array, h5py.Group):
151
+ cap_dict[array_name] = sparse_dataset(array)
152
+ else:
153
+ raise ValueError(
154
+ f"Can't link array in {key} due to unsupported type of object: {type(array)}"
155
+ )
156
+
157
+ def _create_new_matrix(
158
+ self,
159
+ dest: str,
160
+ matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
161
+ matrix_shape: Union[tuple[int, int], None] = None,
162
+ data_dtype: Union[np.dtype, None] = None,
163
+ format: Union[str, None] = None, # TODO: use Enum instead of str
164
+ compression: str = "lzf",
165
+ ) -> None:
166
+ if matrix is not None:
167
+ self._write_elem(dest, matrix, compression=compression)
168
+ else:
169
+ if format == "dense":
170
+ group = self._file.create_dataset(
171
+ name=dest,
172
+ shape=matrix_shape,
173
+ dtype=data_dtype,
174
+ compression=compression,
175
+ )
176
+ # https://anndata.readthedocs.io/en/latest/fileformat-prose.html#dense-arrays-specification-v0-2-0
177
+ group.attrs["encoding-type"] = "array"
178
+ group.attrs["encoding-version"] = "0.2.0"
179
+ elif format in [
180
+ "csr",
181
+ "csc",
182
+ ]: # Based on https://github.com/appier/h5sparse/blob/master/h5sparse/h5sparse.py
183
+ if data_dtype is None:
184
+ data_dtype = np.float64
185
+ if matrix_shape is None:
186
+ matrix_shape = (0, 0)
187
+ sparse_class = ss.csr_matrix if format == "csr" else ss.csc_matrix
188
+ data = sparse_class(matrix_shape, dtype=data_dtype)
189
+ self._write_elem(dest, data, compression=compression)
190
+ else:
191
+ raise NotImplementedError(
192
+ f"Format must be 'dense', 'csr' or 'csc' but {format} given!"
193
+ )
194
+
195
+
196
+ class RawLayer(BaseLayerMatrixAndDf):
197
+ def __init__(self, h5_file: h5py.File):
198
+ super().__init__(h5_file, path_to_content="/raw/")
199
+ self._var: CapAnnDataDF = None
200
+
201
+ @property
202
+ def var(self) -> CapAnnDataDF:
203
+ if self._var is None:
204
+ self._var = self._lazy_df_load("var")
205
+ return self._var
206
+
207
+ @var.setter
208
+ def var(self, cap_df: CapAnnDataDF) -> None:
209
+ self._validate_cap_df(cap_df, axis=1)
210
+ self._var = cap_df
211
+
212
+ def read_var(self, columns: List[str] = None, reset: bool = False) -> None:
213
+ df = self._read_df(key="var", columns=columns)
214
+ if self.var.empty or reset:
215
+ self._var = df
216
+ else:
217
+ for col in df.columns:
218
+ self._var[col] = df[col]
219
+
220
+
221
+ class CapAnnData(BaseLayerMatrixAndDf):
222
+ def __init__(self, h5_file: h5py.File) -> None:
223
+ super().__init__(h5_file, path_to_content="/")
224
+ self._file: h5py.File = h5_file
225
+ self._obs: CapAnnDataDF = None
226
+ self._var: CapAnnDataDF = None
227
+ self._X: X_NOTATION = None
228
+ self._obsm: CapAnnDataDict = None
229
+ self._varm: CapAnnDataDict = None
230
+ self._layers: CapAnnDataDict = None
231
+ self._uns: CapAnnDataDict = None
232
+ self._obsp: CapAnnDataDict = None
233
+ self._varp: CapAnnDataDict = None
234
+ self._raw: RawLayer = None
235
+ self._shape: Tuple[int, int] = None
236
+
237
+ @property
238
+ def obs(self) -> CapAnnDataDF:
239
+ if self._obs is None:
240
+ self._obs = self._lazy_df_load("obs")
241
+ return self._obs
242
+
243
+ @obs.setter
244
+ def obs(self, cap_df: CapAnnDataDF) -> None:
245
+ self._validate_cap_df(cap_df, axis=0)
246
+ self._obs = cap_df
247
+
248
+ @property
249
+ def var(self) -> CapAnnDataDF:
250
+ if self._var is None:
251
+ self._var = self._lazy_df_load("var")
252
+ return self._var
253
+
254
+ @var.setter
255
+ def var(self, cap_df: CapAnnDataDF) -> None:
256
+ self._validate_cap_df(cap_df, axis=1)
257
+ self._var = cap_df
258
+
259
+ @property
260
+ def raw(self) -> RawLayer:
261
+ if self._raw is None:
262
+ if "raw" not in self._file.keys():
263
+ logger.warning("Can't read raw.var since raw layer doesn't exist!")
264
+ return
265
+
266
+ if len(self._file["raw"].keys()) == 0:
267
+ logger.warning("The raw layer is empty!")
268
+ return
269
+
270
+ self._raw = RawLayer(self._file)
271
+ return self._raw
272
+
273
+ @property
274
+ def uns(self) -> CapAnnDataDict[str, Any]:
275
+ if self._uns is None:
276
+ self._uns = CapAnnDataDict(
277
+ {k: NotLinkedObject for k in self._file["uns"].keys()}
278
+ )
279
+ return self._uns
280
+
281
+ @property
282
+ def layers(self) -> CapAnnDataDict[str, X_NOTATION]:
283
+ if self._layers is None:
284
+ self._link_layers()
285
+ return self._layers
286
+
287
+ @property
288
+ def obsm(self) -> CapAnnDataDict[str, X_NOTATION]:
289
+ if self._obsm is None:
290
+ self._link_obsm()
291
+ return self._obsm
292
+
293
+ @property
294
+ def varm(self) -> CapAnnDataDict[str, X_NOTATION]:
295
+ if self._varm is None:
296
+ self._link_varm()
297
+ return self._varm
298
+
299
+ @property
300
+ def obsp(self) -> CapAnnDataDict[str, X_NOTATION]:
301
+ if self._obsp is None:
302
+ self._link_obsp()
303
+ return self._obsp
304
+
305
+ @property
306
+ def varp(self) -> CapAnnDataDict[str, X_NOTATION]:
307
+ if self._varp is None:
308
+ self._link_varp()
309
+ return self._varp
310
+
311
+ def read_obs(self, columns: List[str] = None, reset: bool = False) -> None:
312
+ df = self._read_df("obs", columns=columns)
313
+ if self.obs.empty or reset:
314
+ self._obs = df
315
+ else:
316
+ for col in df.columns:
317
+ self._obs[col] = df[col]
318
+
319
+ def read_var(self, columns: List[str] = None, reset: bool = False) -> None:
320
+ df = self._read_df("var", columns=columns)
321
+ if self.var.empty or reset:
322
+ self._var = df
323
+ else:
324
+ for col in df.columns:
325
+ self._var[col] = df[col]
326
+
327
+ def read_uns(self, keys: List[str] = None) -> None:
328
+ if keys is None:
329
+ keys = list(self.uns.keys())
330
+
331
+ for key in keys:
332
+ existing_keys = self.uns.keys()
333
+ if key in existing_keys:
334
+ source = self._file[f"uns/{key}"]
335
+ self.uns[key] = read_elem(source)
336
+
337
+ def _link_layers(self) -> None:
338
+ if self._layers is None:
339
+ self._layers = CapAnnDataDict()
340
+ if "layers" in self._file.keys():
341
+ self._link_array_mapping(cap_dict=self._layers, key="layers")
342
+
343
+ def _link_obsm(self) -> None:
344
+ key = "obsm"
345
+ if self._obsm is None:
346
+ self._obsm = CapAnnDataDict()
347
+ if key in self._file.keys():
348
+ self._link_array_mapping(cap_dict=self._obsm, key=key)
349
+
350
+ def _link_varm(self) -> None:
351
+ key = "varm"
352
+ if self._varm is None:
353
+ self._varm = CapAnnDataDict()
354
+ if key in self._file.keys():
355
+ self._link_array_mapping(cap_dict=self._varm, key=key)
356
+
357
+ def _link_obsp(self):
358
+ key = "obsp"
359
+ if self._obsp is None:
360
+ self._obsp = CapAnnDataDict()
361
+
362
+ if key in self._file.keys():
363
+ self._link_array_mapping(cap_dict=self._obsp, key=key)
364
+
365
+ def _link_varp(self):
366
+ key = "varp"
367
+ if self._varp is None:
368
+ self._varp = CapAnnDataDict()
369
+
370
+ if key in self._file.keys():
371
+ self._link_array_mapping(cap_dict=self._varp, key=key)
372
+
373
+ def obsm_keys(self) -> List[str]:
374
+ return list(self.obsm.keys())
375
+
376
+ def obs_keys(self) -> List[str]:
377
+ return self.obs.column_order_array().tolist()
378
+
379
+ def var_keys(self) -> List[str]:
380
+ return self.var.column_order_array().tolist()
381
+
382
+ def field_to_entity(self, key):
383
+ if key == "obs":
384
+ return self.obs
385
+ elif key == "var":
386
+ return self.var
387
+ elif key == "raw.var":
388
+ return self.raw.var if self.raw is not None else None
389
+ elif key == "uns":
390
+ return self.uns
391
+ elif key == "layers":
392
+ return self.layers
393
+ elif key == "obsm":
394
+ return self.obsm
395
+ elif key == "varm":
396
+ return self.varm
397
+ elif key == "obsp":
398
+ return self.obsp
399
+ elif key == "varp":
400
+ return self.varp
401
+ else:
402
+ raise KeyError(
403
+ f"The field {key} is not supported! The list of supported fields are equal to {FIELDS_SUPPORTED_TO_OVERWRITE} "
404
+ f"attributes of the CapAnnData class."
405
+ )
406
+
407
+ def overwrite(self, fields: List[str] = None, compression: str = "lzf") -> None:
408
+ if fields is None:
409
+ fields = FIELDS_SUPPORTED_TO_OVERWRITE
410
+
411
+ for key in ["obs", "var", "raw.var"]:
412
+ if key in fields:
413
+ entity: CapAnnDataDF = self.field_to_entity(key)
414
+ if entity is None:
415
+ continue
416
+
417
+ key = key.replace(".", "/") if key == "raw.var" else key
418
+
419
+ for col in entity.columns:
420
+ self._write_elem(
421
+ f"{key}/{col}", entity[col].values, compression=compression
422
+ )
423
+
424
+ column_order = entity.column_order_array()
425
+ if (
426
+ column_order.size == 0
427
+ ): # Refs https://github.com/cellannotation/cap-anndata/issues/6
428
+ column_order = np.array([], dtype=np.float64)
429
+
430
+ # Index update
431
+ index_name = entity.index.name
432
+ if not index_name:
433
+ index_name = "_index"
434
+ self._file[key].attrs["_index"] = index_name
435
+ index_col = self._read_attr(self._file[key], "_index")
436
+ self._write_elem(
437
+ f"{key}/{index_col}", entity.index.to_numpy(), compression=compression
438
+ )
439
+
440
+ self._file[key].attrs["column-order"] = column_order
441
+
442
+ if "uns" in fields:
443
+ for key in self.uns.keys():
444
+ if self.uns[key] is not NotLinkedObject:
445
+ dest = f"uns/{key}"
446
+ self._write_elem(dest, self.uns[key], compression=compression)
447
+ for key in self.uns.keys_to_remove:
448
+ del self._file[f"uns/{key}"]
449
+
450
+ for field in ["layers", "obsm", "varm", "obsp", "varp"]:
451
+ if field in fields:
452
+ for key in self.field_to_entity(field).keys_to_remove:
453
+ del self._file[f"{field}/{key}"]
454
+
455
+ def create_layer(
456
+ self,
457
+ name: str,
458
+ matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
459
+ matrix_shape: Union[tuple[int, int], None] = None,
460
+ data_dtype: Union[np.dtype, None] = None,
461
+ format: Union[str, None] = None,
462
+ compression: str = "lzf",
463
+ ) -> None:
464
+ """
465
+ An empty layer is created when `matrix` is None.
466
+ """
467
+ self._create_new_matrix_in_field(
468
+ field="layers",
469
+ name=name,
470
+ matrix=matrix,
471
+ matrix_shape=matrix_shape,
472
+ data_dtype=data_dtype,
473
+ format=format,
474
+ compression=compression,
475
+ )
476
+ self._link_layers()
477
+
478
+ def create_obsm(
479
+ self,
480
+ name: str,
481
+ matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
482
+ matrix_shape: Union[tuple[int, int], None] = None,
483
+ data_dtype: Union[np.dtype, None] = None,
484
+ format: Union[str, None] = None,
485
+ compression: str = "lzf",
486
+ ) -> None:
487
+ self._create_new_matrix_in_field(
488
+ field="obsm",
489
+ name=name,
490
+ matrix=matrix,
491
+ matrix_shape=matrix_shape,
492
+ data_dtype=data_dtype,
493
+ format=format,
494
+ compression=compression,
495
+ )
496
+ self._link_obsm()
497
+
498
+ def create_varm(
499
+ self,
500
+ name: str,
501
+ matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
502
+ matrix_shape: Union[tuple[int, int], None] = None,
503
+ data_dtype: Union[np.dtype, None] = None,
504
+ format: Union[str, None] = None,
505
+ compression: str = "lzf",
506
+ ) -> None:
507
+ self._create_new_matrix_in_field(
508
+ field="varm",
509
+ name=name,
510
+ matrix=matrix,
511
+ matrix_shape=matrix_shape,
512
+ data_dtype=data_dtype,
513
+ format=format,
514
+ compression=compression,
515
+ )
516
+ self._link_varm()
517
+
518
+ def create_obsp(
519
+ self,
520
+ name: str,
521
+ matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
522
+ matrix_shape: Union[tuple[int, int], None] = None,
523
+ data_dtype: Union[np.dtype, None] = None,
524
+ format: Union[str, None] = None,
525
+ compression: str = "lzf",
526
+ ) -> None:
527
+ self._create_new_matrix_in_field(
528
+ field="obsp",
529
+ name=name,
530
+ matrix=matrix,
531
+ matrix_shape=matrix_shape,
532
+ data_dtype=data_dtype,
533
+ format=format,
534
+ compression=compression,
535
+ )
536
+ self._link_obsp()
537
+
538
+ def create_varp(
539
+ self,
540
+ name: str,
541
+ matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
542
+ matrix_shape: Union[tuple[int, int], None] = None,
543
+ data_dtype: Union[np.dtype, None] = None,
544
+ format: Union[str, None] = None,
545
+ compression: str = "lzf",
546
+ ) -> None:
547
+
548
+ self._create_new_matrix_in_field(
549
+ field="varp",
550
+ name=name,
551
+ matrix=matrix,
552
+ matrix_shape=matrix_shape,
553
+ data_dtype=data_dtype,
554
+ format=format,
555
+ compression=compression,
556
+ )
557
+ self._link_varp()
558
+
559
+ def _create_new_matrix_in_field(self, field, name, **kwargs):
560
+ """**kwargs: matrix, matrix_shape, data_dtype, format, compression"""
561
+ dest = f"{field}/{name}"
562
+ field_entity = getattr(self, field)
563
+ if name in field_entity.keys():
564
+ raise ValueError(
565
+ f"Please explicitly remove the existing '{name}' entity from {field} "
566
+ f"before creating a new one!"
567
+ )
568
+ if field not in self._file.keys():
569
+ self._file.create_group(field)
570
+ self._create_new_matrix(dest=dest, **kwargs)
571
+
572
+ def remove_layer(self, name: str) -> None:
573
+ del self._file[f"layers/{name}"]
574
+ self._link_layers()
575
+
576
+ def remove_obsp(self, name: str) -> None:
577
+ del self._file[f"obsp/{name}"]
578
+ self._link_obsp()
579
+
580
+ def remove_varp(self, name: str) -> None:
581
+ del self._file[f"varp/{name}"]
582
+ self._link_varp()
583
+
584
+ def remove_obsm(self, name: str) -> None:
585
+ del self._file[f"obsm/{name}"]
586
+ self._link_obsm()
587
+
588
+ def remove_varm(self, name: str) -> None:
589
+ del self._file[f"varm/{name}"]
590
+ self._link_varm()
591
+
592
+ def create_repr(self) -> str:
593
+ indent = " " * 4
594
+ s = f"CapAnnData object"
595
+ s += f"\n{indent}File: {self._file}"
596
+ s += f"\n{indent}X shape: {self.shape}"
597
+ s += f"\n{indent}Has raw X: {self.raw is not None}"
598
+ for field in ["obs", "obsm", "var", "uns", "layers"]:
599
+ if field in self._file:
600
+ in_memory = set()
601
+ if field in ["obs", "var", "uns"]:
602
+ attr = getattr(self, field)
603
+ if attr is not None:
604
+ in_memory = set(attr.keys())
605
+ keys = list(self._file[field].keys())
606
+ keys = [k for k in keys if k != "_index"]
607
+ keys = [(k if k not in in_memory else f"{k}*") for k in keys]
608
+ keys_str = str(keys).replace("*'", "'*")
609
+ s += f"\n{indent}{field}: {keys_str}"
610
+ s += f"\n{indent}Note: fields marked with * are in-memory objects."
611
+ return s
612
+
613
+ def __repr__(self) -> str:
614
+ return self.create_repr()
615
+
616
+ def __str__(self) -> str:
617
+ return self.create_repr()
618
+
619
+ def __enter__(self):
620
+ return self
621
+
622
+ def __exit__(self, *args):
623
+ if self._file is not None:
624
+ self._file.close()
625
+ logger.debug("CapAnnData closed!")