cap-anndata 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

@@ -1,600 +1,625 @@
-import logging
-import anndata as ad
-import numpy as np
-import h5py
-from typing import List, Union, Any, Tuple, Final
-import scipy.sparse as ss
-from packaging import version
-
-if version.parse(ad.__version__) < version.parse("0.11.0"):
-    from anndata.experimental import sparse_dataset, read_elem, write_elem
-else:
-    from anndata import sparse_dataset, read_elem, write_elem
-
-from cap_anndata import CapAnnDataDF, CapAnnDataDict
-
-logger = logging.getLogger(__name__)
-
-X_NOTATION = Union[
-    h5py.Dataset, ad.experimental.CSRDataset, ad.experimental.CSCDataset, None
-]
-ARRAY_MAPPING_NOTATION = CapAnnDataDict[str, X_NOTATION]
-
-NotLinkedObject: Final = "__NotLinkedObject"
-
-
-class BaseLayerMatrixAndDf:
-    def __init__(self, file: h5py.File, path_to_content: str = "/") -> None:
-        self._file = file
-        self._path_to_content = path_to_content
-        self._X: X_NOTATION = None
-
-    @property
-    def file(self) -> h5py.File:
-        return self._file
-
-    @property
-    def X(self) -> X_NOTATION:
-        if self._X is None:
-            self._link_x()
-        return self._X
-
-    def _link_x(self) -> None:
-        x = self._file[self._path_to_content + "X"]
-        if isinstance(x, h5py.Dataset):
-            # dense X
-            self._X = x
-        else:
-            # sparse dataset
-            self._X = sparse_dataset(x)
-
-    @property
-    def shape(self) -> Tuple[int, int]:
-        if self.X is not None:
-            shape = tuple(map(int, self.X.shape))
-        else:
-            shape = None
-        return shape
-
-    def _lazy_df_load(self, key: str) -> CapAnnDataDF:
-        df = CapAnnDataDF()
-        attribute = self._path_to_content + key
-        column_order = self._read_attr(self._file[attribute], "column-order")
-        df.column_order = column_order
-        if df.column_order.dtype != object:
-            # empty DataFrame will have column_order as float64
-            # which leads to failure in overwrite method
-            df.column_order = df.column_order.astype(object)
-        return df
-
-    @staticmethod
-    def _read_attr(obj: Union[h5py.Group, h5py.Dataset], attr_name: str) -> any:
-        attrs = dict(obj.attrs)
-        if attr_name not in attrs.keys():
-            raise KeyError(f"The {attr_name} doesn't exist!")
-        return attrs[attr_name]
-
-    def _read_df(self, key: str, columns: List[str]) -> CapAnnDataDF:
-        group_path = self._path_to_content + key
-        if group_path not in self._file.keys():
-            raise ValueError(f"The group {group_path} doesn't exist in the file!")
-
-        h5_group = self._file[group_path]
-
-        column_order = self._read_attr(h5_group, "column-order")
-
-        if columns is None:
-            # read whole df
-            df = CapAnnDataDF.from_df(read_elem(h5_group), column_order=column_order)
-        else:
-            if isinstance(columns, str):
-                # single column provided instead of list
-                columns = [columns]
-            cols_to_read = [c for c in columns if c in column_order]
-            df = CapAnnDataDF()
-            df.column_order = column_order
-            index_col = self._read_attr(h5_group, "_index")
-            df.index = read_elem(h5_group[index_col])
-
-            for col in cols_to_read:
-                df[col] = read_elem(h5_group[col])
-
-        if df.column_order.dtype != object:
-            # empty DataFrame will have column_order as float64
-            # which leads to failure in overwrite method
-            df.column_order = df.column_order.astype(object)
-        return df
-
-    def _write_elem(self, dest_key: str, elem: any, compression: str) -> None:
-        write_elem(
-            self._file, dest_key, elem, dataset_kwargs={"compression": compression}
-        )
-
-    def _validate_cap_df(self, cap_df: CapAnnDataDF, axis: int) -> None:
-        if not isinstance(cap_df, CapAnnDataDF):
-            raise TypeError(
-                f"The input should be an instance of CapAnnDataDF class but {type(cap_df)} given!"
-            )
-
-        if axis not in [0, 1]:
-            raise ValueError("The axis should be either 0 or 1!")
-
-        if cap_df.shape[0] != self.shape[axis]:
-            items = "cells" if axis == 0 else "genes"
-            raise ValueError(
-                f"The number of rows in the input DataFrame should be equal to the number of {items} in the "
-                "AnnData object!"
-            )
-
-    def _link_array_mapping(self, cap_dict: CapAnnDataDict, key: str) -> None:
-        """Method to update given cap_dict with backed array entities from the file."""
-        if key not in self._file.keys():
-            raise KeyError(f"The key {key} doesn't exist in the file! Ignore linking.")
-
-        group = self._file[key]
-        if not isinstance(group, h5py.Group):
-            raise ValueError(f"The object {key} must be a group!")
-
-        for array_name in group.keys():
-            array = group[array_name]
-            if isinstance(array, h5py.Dataset):
-                cap_dict[array_name] = array
-            elif isinstance(array, h5py.Group):
-                cap_dict[array_name] = sparse_dataset(array)
-            else:
-                raise ValueError(
-                    f"Can't link array in {key} due to unsupported type of object: {type(array)}"
-                )
-
-    def _create_new_matrix(
-        self,
-        dest: str,
-        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
-        matrix_shape: Union[tuple[int, int], None] = None,
-        data_dtype: Union[np.dtype, None] = None,
-        format: Union[str, None] = None,  # TODO: use Enum instead of str
-        compression: str = "lzf",
-    ) -> None:
-        if matrix is not None:
-            self._write_elem(dest, matrix, compression=compression)
-        else:
-            if format == "dense":
-                group = self._file.create_dataset(
-                    name=dest,
-                    shape=matrix_shape,
-                    dtype=data_dtype,
-                    compression=compression,
-                )
-                # https://anndata.readthedocs.io/en/latest/fileformat-prose.html#dense-arrays-specification-v0-2-0
-                group.attrs["encoding-type"] = "array"
-                group.attrs["encoding-version"] = "0.2.0"
-            elif format in [
-                "csr",
-                "csc",
-            ]:  # Based on https://github.com/appier/h5sparse/blob/master/h5sparse/h5sparse.py
-                if data_dtype is None:
-                    data_dtype = np.float64
-                if matrix_shape is None:
-                    matrix_shape = (0, 0)
-                sparse_class = ss.csr_matrix if format == "csr" else ss.csc_matrix
-                data = sparse_class(matrix_shape, dtype=data_dtype)
-                self._write_elem(dest, data, compression=compression)
-            else:
-                raise NotImplementedError(
-                    f"Format must be 'dense', 'csr' or 'csc' but {format} given!"
-                )
-
-
-class RawLayer(BaseLayerMatrixAndDf):
-    def __init__(self, h5_file: h5py.File):
-        super().__init__(h5_file, path_to_content="/raw/")
-        self._var: CapAnnDataDF = None
-
-    @property
-    def var(self) -> CapAnnDataDF:
-        if self._var is None:
-            self._var = self._lazy_df_load("var")
-        return self._var
-
-    @var.setter
-    def var(self, cap_df: CapAnnDataDF) -> None:
-        self._validate_cap_df(cap_df, axis=1)
-        self._var = cap_df
-
-    def read_var(self, columns: List[str] = None, reset: bool = False) -> None:
-        df = self._read_df(key="var", columns=columns)
-        if self.var.empty or reset:
-            self._var = df
-        else:
-            for col in df.columns:
-                self._var[col] = df[col]
-
-
-class CapAnnData(BaseLayerMatrixAndDf):
-    def __init__(self, h5_file: h5py.File) -> None:
-        super().__init__(h5_file, path_to_content="/")
-        self._file: h5py.File = h5_file
-        self._obs: CapAnnDataDF = None
-        self._var: CapAnnDataDF = None
-        self._X: X_NOTATION = None
-        self._obsm: CapAnnDataDict = None
-        self._varm: CapAnnDataDict = None
-        self._layers: CapAnnDataDict = None
-        self._uns: CapAnnDataDict = None
-        self._obsp: CapAnnDataDict = None
-        self._varp: CapAnnDataDict = None
-        self._raw: RawLayer = None
-        self._shape: Tuple[int, int] = None
-
-    @property
-    def obs(self) -> CapAnnDataDF:
-        if self._obs is None:
-            self._obs = self._lazy_df_load("obs")
-        return self._obs
-
-    @obs.setter
-    def obs(self, cap_df: CapAnnDataDF) -> None:
-        self._validate_cap_df(cap_df, axis=0)
-        self._obs = cap_df
-
-    @property
-    def var(self) -> CapAnnDataDF:
-        if self._var is None:
-            self._var = self._lazy_df_load("var")
-        return self._var
-
-    @var.setter
-    def var(self, cap_df: CapAnnDataDF) -> None:
-        self._validate_cap_df(cap_df, axis=1)
-        self._var = cap_df
-
-    @property
-    def raw(self) -> RawLayer:
-        if self._raw is None:
-            if "raw" not in self._file.keys():
-                logger.warning("Can't read raw.var since raw layer doesn't exist!")
-                return
-
-            if len(self._file["raw"].keys()) == 0:
-                logger.warning("The raw layer is empty!")
-                return
-
-            self._raw = RawLayer(self._file)
-        return self._raw
-
-    @property
-    def uns(self) -> CapAnnDataDict[str, Any]:
-        if self._uns is None:
-            self._uns = CapAnnDataDict(
-                {k: NotLinkedObject for k in self._file["uns"].keys()}
-            )
-        return self._uns
-
-    @property
-    def layers(self) -> CapAnnDataDict[str, X_NOTATION]:
-        if self._layers is None:
-            self._link_layers()
-        return self._layers
-
-    @property
-    def obsm(self) -> CapAnnDataDict[str, X_NOTATION]:
-        if self._obsm is None:
-            self._link_obsm()
-        return self._obsm
-
-    @property
-    def varm(self) -> CapAnnDataDict[str, X_NOTATION]:
-        if self._varm is None:
-            self._link_varm()
-        return self._varm
-
-    @property
-    def obsp(self) -> CapAnnDataDict[str, X_NOTATION]:
-        if self._obsp is None:
-            self._link_obsp()
-        return self._obsp
-
-    @property
-    def varp(self) -> CapAnnDataDict[str, X_NOTATION]:
-        if self._varp is None:
-            self._link_varp()
-        return self._varp
-
-    def read_obs(self, columns: List[str] = None, reset: bool = False) -> None:
-        df = self._read_df("obs", columns=columns)
-        if self.obs.empty or reset:
-            self._obs = df
-        else:
-            for col in df.columns:
-                self._obs[col] = df[col]
-
-    def read_var(self, columns: List[str] = None, reset: bool = False) -> None:
-        df = self._read_df("var", columns=columns)
-        if self.var.empty or reset:
-            self._var = df
-        else:
-            for col in df.columns:
-                self._var[col] = df[col]
-
-    def read_uns(self, keys: List[str] = None) -> None:
-        if keys is None:
-            keys = list(self.uns.keys())
-
-        for key in keys:
-            existing_keys = self.uns.keys()
-            if key in existing_keys:
-                source = self._file[f"uns/{key}"]
-                self.uns[key] = read_elem(source)
-
-    def _link_layers(self) -> None:
-        if self._layers is None:
-            self._layers = CapAnnDataDict()
-        if "layers" in self._file.keys():
-            self._link_array_mapping(cap_dict=self._layers, key="layers")
-
-    def _link_obsm(self) -> None:
-        key = "obsm"
-        if self._obsm is None:
-            self._obsm = CapAnnDataDict()
-        if key in self._file.keys():
-            self._link_array_mapping(cap_dict=self._obsm, key=key)
-
-    def _link_varm(self) -> None:
-        key = "varm"
-        if self._varm is None:
-            self._varm = CapAnnDataDict()
-        if key in self._file.keys():
-            self._link_array_mapping(cap_dict=self._varm, key=key)
-
-    def _link_obsp(self):
-        key = "obsp"
-        if self._obsp is None:
-            self._obsp = CapAnnDataDict()
-
-        if key in self._file.keys():
-            self._link_array_mapping(cap_dict=self._obsp, key=key)
-
-    def _link_varp(self):
-        key = "varp"
-        if self._varp is None:
-            self._varp = CapAnnDataDict()
-
-        if key in self._file.keys():
-            self._link_array_mapping(cap_dict=self._varp, key=key)
-
-    def obsm_keys(self) -> List[str]:
-        return list(self.obsm.keys())
-
-    def obs_keys(self) -> List[str]:
-        return self.obs.column_order.tolist()
-
-    def var_keys(self) -> List[str]:
-        return self.var.column_order.tolist()
-
-    def overwrite(self, fields: List[str] = None, compression: str = "lzf") -> None:
-        field_to_entity = {
-            "obs": self.obs,
-            "var": self.var,
-            "raw.var": self.raw.var if self.raw is not None else None,
-            "uns": self.uns,
-            "layers": self.layers,
-            "obsm": self.obsm,
-            "varm": self.varm,
-            "obsp": self.obsp,
-            "varp": self.varp,
-        }
-
-        if fields is None:
-            fields = list(field_to_entity.keys())
-        else:
-            for f in fields:
-                if f not in field_to_entity.keys():
-                    raise KeyError(
-                        f"The field {f} is not supported! The list of supported fields are equal to supported "
-                        f"attributes of the CapAnnData class: obs, var, raw.var and uns."
-                    )
-
-        for key in ["obs", "var", "raw.var"]:
-            if key in fields:
-                entity: CapAnnDataDF = field_to_entity[key]
-                if entity is None:
-                    continue
-
-                key = key.replace(".", "/") if key == "raw.var" else key
-
-                for col in entity.columns:
-                    self._write_elem(
-                        f"{key}/{col}", entity[col].values, compression=compression
-                    )
-
-                column_order = entity.column_order
-                if (
-                    column_order.size == 0
-                ):  # Refs https://github.com/cellannotation/cap-anndata/issues/6
-                    column_order = np.array([], dtype=np.float64)
-                self._file[key].attrs["column-order"] = column_order
-
-        if "uns" in fields:
-            for key in self.uns.keys():
-                if self.uns[key] is not NotLinkedObject:
-                    dest = f"uns/{key}"
-                    self._write_elem(dest, self.uns[key], compression=compression)
-            for key in self.uns.keys_to_remove:
-                del self._file[f"uns/{key}"]
-
-        for field in ["layers", "obsm", "varm", "obsp", "varp"]:
-            if field in fields:
-                for key in field_to_entity[field].keys_to_remove:
-                    del self._file[f"{field}/{key}"]
-
-    def create_layer(
-        self,
-        name: str,
-        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
-        matrix_shape: Union[tuple[int, int], None] = None,
-        data_dtype: Union[np.dtype, None] = None,
-        format: Union[str, None] = None,
-        compression: str = "lzf",
-    ) -> None:
-        """
-        The empty layer will be created in the case of `matrix` is None.
-        """
-        self._create_new_matrix_in_field(
-            field="layers",
-            name=name,
-            matrix=matrix,
-            matrix_shape=matrix_shape,
-            data_dtype=data_dtype,
-            format=format,
-            compression=compression,
-        )
-        self._link_layers()
-
-    def create_obsm(
-        self,
-        name: str,
-        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
-        matrix_shape: Union[tuple[int, int], None] = None,
-        data_dtype: Union[np.dtype, None] = None,
-        format: Union[str, None] = None,
-        compression: str = "lzf",
-    ) -> None:
-        self._create_new_matrix_in_field(
-            field="obsm",
-            name=name,
-            matrix=matrix,
-            matrix_shape=matrix_shape,
-            data_dtype=data_dtype,
-            format=format,
-            compression=compression,
-        )
-        self._link_obsm()
-
-    def create_varm(
-        self,
-        name: str,
-        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
-        matrix_shape: Union[tuple[int, int], None] = None,
-        data_dtype: Union[np.dtype, None] = None,
-        format: Union[str, None] = None,
-        compression: str = "lzf",
-    ) -> None:
-        self._create_new_matrix_in_field(
-            field="varm",
-            name=name,
-            matrix=matrix,
-            matrix_shape=matrix_shape,
-            data_dtype=data_dtype,
-            format=format,
-            compression=compression,
-        )
-        self._link_varm()
-
-    def create_obsp(
-        self,
-        name: str,
-        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
-        matrix_shape: Union[tuple[int, int], None] = None,
-        data_dtype: Union[np.dtype, None] = None,
-        format: Union[str, None] = None,
-        compression: str = "lzf",
-    ) -> None:
-        self._create_new_matrix_in_field(
-            field="obsp",
-            name=name,
-            matrix=matrix,
-            matrix_shape=matrix_shape,
-            data_dtype=data_dtype,
-            format=format,
-            compression=compression,
-        )
-        self._link_obsp()
-
-    def create_varp(
-        self,
-        name: str,
-        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
-        matrix_shape: Union[tuple[int, int], None] = None,
-        data_dtype: Union[np.dtype, None] = None,
-        format: Union[str, None] = None,
-        compression: str = "lzf",
-    ) -> None:
-
-        self._create_new_matrix_in_field(
-            field="varp",
-            name=name,
-            matrix=matrix,
-            matrix_shape=matrix_shape,
-            data_dtype=data_dtype,
-            format=format,
-            compression=compression,
-        )
-        self._link_varp()
-
-    def _create_new_matrix_in_field(self, field, name, **kwargs):
-        """**kwargs: matrix, matrix_shape, data_dtype, format, compression"""
-        dest = f"{field}/{name}"
-        field_entity = getattr(self, field)
-        if name in field_entity.keys():
-            raise ValueError(
-                f"Please explicitly remove the existing '{name}' entity from {field} "
-                f"before creating a new one!"
-            )
-        if field not in self._file.keys():
-            self._file.create_group(field)
-        self._create_new_matrix(dest=dest, **kwargs)
-
-    def remove_layer(self, name: str) -> None:
-        del self._file[f"layers/{name}"]
-        self._link_layers()
-
-    def remove_obsp(self, name: str) -> None:
-        del self._file[f"obsp/{name}"]
-        self._link_obsp()
-
-    def remove_varp(self, name: str) -> None:
-        del self._file[f"varp/{name}"]
-        self._link_varp()
-
-    def remove_obsm(self, name: str) -> None:
-        del self._file[f"obsm/{name}"]
-        self._link_obsm()
-
-    def remove_varm(self, name: str) -> None:
-        del self._file[f"varm/{name}"]
-        self._link_varm()
-
-    def create_repr(self) -> str:
-        indent = " " * 4
-        s = f"CapAnnData object"
-        s += f"\n{indent}File: {self._file}"
-        s += f"\n{indent}X shape: {self.shape}"
-        s += f"\n{indent}Has raw X: {self.raw is not None}"
-        for field in ["obs", "obsm", "var", "uns", "layers"]:
-            if field in self._file:
-                in_memory = set()
-                if field in ["obs", "var", "uns"]:
-                    attr = getattr(self, field)
-                    if attr is not None:
-                        in_memory = set(attr.keys())
-                keys = list(self._file[field].keys())
-                keys = [k for k in keys if k != "_index"]
-                keys = [(k if k not in in_memory else f"{k}*") for k in keys]
-                keys_str = str(keys).replace("*'", "'*")
-                s += f"\n{indent}{field}: {keys_str}"
-        s += f"\n{indent}Note: fields marked with * are in-memory objects."
-        return s
-
-    def __repr__(self) -> str:
-        return self.create_repr()
-
-    def __str__(self) -> str:
-        return self.create_repr()
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, *args):
-        if self._file is not None:
-            self._file.close()
-            logger.debug("CapAnnData closed!")
+import logging
+import anndata as ad
+import numpy as np
+import h5py
+from typing import List, Union, Any, Tuple, Final
+import scipy.sparse as ss
+from packaging import version
+
+if version.parse(ad.__version__) < version.parse("0.11.0"):
+    from anndata.experimental import (
+        sparse_dataset,
+        read_elem,
+        write_elem,
+        CSRDataset,
+        CSCDataset,
+    )
+else:
+    from anndata.io import (
+        sparse_dataset,
+        read_elem,
+        write_elem,
+    )
+    from anndata.abc import (
+        CSRDataset,
+        CSCDataset,
+    )
+
+from cap_anndata import CapAnnDataDF, CapAnnDataDict
+
+logger = logging.getLogger(__name__)
+
+X_NOTATION = Union[
+    h5py.Dataset, CSRDataset, CSCDataset, None
+]
+ARRAY_MAPPING_NOTATION = CapAnnDataDict[str, X_NOTATION]
+FIELDS_SUPPORTED_TO_OVERWRITE = ["obs", "var", "raw.var", "uns", "layers", "obsm", "varm", "obsp", "varp"]
+NotLinkedObject: Final = "__NotLinkedObject"
+
+
+class BaseLayerMatrixAndDf:
+    def __init__(self, file: h5py.File, path_to_content: str = "/") -> None:
+        self._file = file
+        self._path_to_content = path_to_content
+        self._X: X_NOTATION = None
+
+    @property
+    def file(self) -> h5py.File:
+        return self._file
+
+    @property
+    def X(self) -> X_NOTATION:
+        if self._X is None:
+            self._link_x()
+        return self._X
+
+    def _link_x(self) -> None:
+        x = self._file[self._path_to_content + "X"]
+        if isinstance(x, h5py.Dataset):
+            # dense X
+            self._X = x
+        else:
+            # sparse dataset
+            self._X = sparse_dataset(x)
+
+    @property
+    def shape(self) -> Tuple[int, int]:
+        if self.X is not None:
+            shape = tuple(map(int, self.X.shape))
+        else:
+            shape = None
+        return shape
+
+    def _lazy_df_load(self, key: str) -> CapAnnDataDF:
+        return self._read_df(key=key, columns=[])
+
+    @staticmethod
+    def _read_attr(obj: Union[h5py.Group, h5py.Dataset], attr_name: str) -> any:
+        attrs = dict(obj.attrs)
+        if attr_name not in attrs.keys():
+            raise KeyError(f"The {attr_name} doesn't exist!")
+        return attrs[attr_name]
+
+    def _read_df(self, key: str, columns: List[str]) -> CapAnnDataDF:
+        group_path = self._path_to_content + key
+        if group_path not in self._file.keys():
+            raise ValueError(f"The group {group_path} doesn't exist in the file!")
+
+        h5_group = self._file[group_path]
+
+        column_order = self._read_attr(h5_group, "column-order")
+
+        if columns is None:
+            # read whole df
+            df = CapAnnDataDF.from_df(read_elem(h5_group), column_order=column_order)
+        else:
+            if isinstance(columns, str):
+                # single column provided instead of list
+                columns = [columns]
+            cols_to_read = [c for c in columns if c in column_order]
+            df = CapAnnDataDF()
+            df.column_order = column_order
+
+            index_col = self._read_attr(h5_group, "_index")
+            index = read_elem(h5_group[index_col])
+            df.index = index
+
+            for col in cols_to_read:
+                df[col] = read_elem(h5_group[col])
+
+        if df.column_order.dtype != object:
+            # empty DataFrame will have column_order as float64
+            # which leads to failure in overwrite method
+            df.column_order = df.column_order.astype(object)
+        return df
+
+    def _write_elem(self, dest_key: str, elem: any, compression: str) -> None:
+        write_elem(
+            self._file, dest_key, elem, dataset_kwargs={"compression": compression}
+        )
+
+    def _validate_cap_df(self, cap_df: CapAnnDataDF, axis: int) -> None:
+        if not isinstance(cap_df, CapAnnDataDF):
+            raise TypeError(
+                f"The input should be an instance of CapAnnDataDF class but {type(cap_df)} given!"
+            )
+
+        if axis not in [0, 1]:
+            raise ValueError("The axis should be either 0 or 1!")
+
+        if cap_df.shape[0] != self.shape[axis]:
+            items = "cells" if axis == 0 else "genes"
+            raise ValueError(
+                f"The number of rows in the input DataFrame should be equal to the number of {items} in the "
+                "AnnData object!"
+            )
+
+    def _link_array_mapping(self, cap_dict: CapAnnDataDict, key: str) -> None:
+        """Method to update given cap_dict with backed array entities from the file."""
+        if key not in self._file.keys():
+            raise KeyError(f"The key {key} doesn't exist in the file! Ignore linking.")
+
+        group = self._file[key]
+        if not isinstance(group, h5py.Group):
+            raise ValueError(f"The object {key} must be a group!")
+
+        for array_name in group.keys():
+            array = group[array_name]
+            if isinstance(array, h5py.Dataset):
+                cap_dict[array_name] = array
+            elif isinstance(array, h5py.Group):
+                cap_dict[array_name] = sparse_dataset(array)
+            else:
+                raise ValueError(
+                    f"Can't link array in {key} due to unsupported type of object: {type(array)}"
+                )
+
+    def _create_new_matrix(
+        self,
+        dest: str,
+        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
+        matrix_shape: Union[tuple[int, int], None] = None,
+        data_dtype: Union[np.dtype, None] = None,
+        format: Union[str, None] = None,  # TODO: use Enum instead of str
+        compression: str = "lzf",
+    ) -> None:
+        if matrix is not None:
+            self._write_elem(dest, matrix, compression=compression)
+        else:
+            if format == "dense":
+                group = self._file.create_dataset(
+                    name=dest,
+                    shape=matrix_shape,
+                    dtype=data_dtype,
+                    compression=compression,
+                )
+                # https://anndata.readthedocs.io/en/latest/fileformat-prose.html#dense-arrays-specification-v0-2-0
+                group.attrs["encoding-type"] = "array"
+                group.attrs["encoding-version"] = "0.2.0"
+            elif format in [
+                "csr",
+                "csc",
+            ]:  # Based on https://github.com/appier/h5sparse/blob/master/h5sparse/h5sparse.py
+                if data_dtype is None:
+                    data_dtype = np.float64
+                if matrix_shape is None:
+                    matrix_shape = (0, 0)
+                sparse_class = ss.csr_matrix if format == "csr" else ss.csc_matrix
+                data = sparse_class(matrix_shape, dtype=data_dtype)
+                self._write_elem(dest, data, compression=compression)
+            else:
+                raise NotImplementedError(
+                    f"Format must be 'dense', 'csr' or 'csc' but {format} given!"
+                )
+
+
+class RawLayer(BaseLayerMatrixAndDf):
+    def __init__(self, h5_file: h5py.File):
+        super().__init__(h5_file, path_to_content="/raw/")
+        self._var: CapAnnDataDF = None
+
+    @property
+    def var(self) -> CapAnnDataDF:
+        if self._var is None:
+            self._var = self._lazy_df_load("var")
+        return self._var
+
+    @var.setter
+    def var(self, cap_df: CapAnnDataDF) -> None:
+        self._validate_cap_df(cap_df, axis=1)
+        self._var = cap_df
+
+    def read_var(self, columns: List[str] = None, reset: bool = False) -> None:
+        df = self._read_df(key="var", columns=columns)
+        if self.var.empty or reset:
+            self._var = df
+        else:
+            for col in df.columns:
+                self._var[col] = df[col]
+
+
+class CapAnnData(BaseLayerMatrixAndDf):
+    def __init__(self, h5_file: h5py.File) -> None:
+        super().__init__(h5_file, path_to_content="/")
+        self._file: h5py.File = h5_file
+        self._obs: CapAnnDataDF = None
+        self._var: CapAnnDataDF = None
+        self._X: X_NOTATION = None
+        self._obsm: CapAnnDataDict = None
+        self._varm: CapAnnDataDict = None
+        self._layers: CapAnnDataDict = None
+        self._uns: CapAnnDataDict = None
+        self._obsp: CapAnnDataDict = None
+        self._varp: CapAnnDataDict = None
+        self._raw: RawLayer = None
+        self._shape: Tuple[int, int] = None
+
+    @property
+    def obs(self) -> CapAnnDataDF:
+        if self._obs is None:
+            self._obs = self._lazy_df_load("obs")
+        return self._obs
+
+    @obs.setter
+    def obs(self, cap_df: CapAnnDataDF) -> None:
+        self._validate_cap_df(cap_df, axis=0)
+        self._obs = cap_df
+
+    @property
+    def var(self) -> CapAnnDataDF:
+        if self._var is None:
+            self._var = self._lazy_df_load("var")
+        return self._var
+
+    @var.setter
+    def var(self, cap_df: CapAnnDataDF) -> None:
+        self._validate_cap_df(cap_df, axis=1)
+        self._var = cap_df
+
+    @property
+    def raw(self) -> RawLayer:
+        if self._raw is None:
+            if "raw" not in self._file.keys():
+                logger.warning("Can't read raw.var since raw layer doesn't exist!")
+                return
+
+            if len(self._file["raw"].keys()) == 0:
+                logger.warning("The raw layer is empty!")
+                return
+
+            self._raw = RawLayer(self._file)
+        return self._raw
+
+    @property
+    def uns(self) -> CapAnnDataDict[str, Any]:
+        if self._uns is None:
+            self._uns = CapAnnDataDict(
+                {k: NotLinkedObject for k in self._file["uns"].keys()}
+            )
+        return self._uns
+
+    @property
+    def layers(self) -> CapAnnDataDict[str, X_NOTATION]:
+        if self._layers is None:
+            self._link_layers()
+        return self._layers
+
+    @property
+    def obsm(self) -> CapAnnDataDict[str, X_NOTATION]:
+        if self._obsm is None:
+            self._link_obsm()
+        return self._obsm
+
+    @property
+    def varm(self) -> CapAnnDataDict[str, X_NOTATION]:
+        if self._varm is None:
+            self._link_varm()
+        return self._varm
+
+    @property
+    def obsp(self) -> CapAnnDataDict[str, X_NOTATION]:
+        if self._obsp is None:
+            self._link_obsp()
+        return self._obsp
+
+    @property
+    def varp(self) -> CapAnnDataDict[str, X_NOTATION]:
+        if self._varp is None:
+            self._link_varp()
+        return self._varp
+
+    def read_obs(self, columns: List[str] = None, reset: bool = False) -> None:
+        df = self._read_df("obs", columns=columns)
+        if self.obs.empty or reset:
+            self._obs = df
+        else:
+            for col in df.columns:
+                self._obs[col] = df[col]
+
+    def read_var(self, columns: List[str] = None, reset: bool = False) -> None:
+        df = self._read_df("var", columns=columns)
+        if self.var.empty or reset:
+            self._var = df
+        else:
+            for col in df.columns:
+                self._var[col] = df[col]
+
+    def read_uns(self, keys: List[str] = None) -> None:
+        if keys is None:
+            keys = list(self.uns.keys())
+
+        for key in keys:
+            existing_keys = self.uns.keys()
+            if key in existing_keys:
+                source = self._file[f"uns/{key}"]
+                self.uns[key] = read_elem(source)
+
+    def _link_layers(self) -> None:
+        if self._layers is None:
+            self._layers = CapAnnDataDict()
+        if "layers" in self._file.keys():
+            self._link_array_mapping(cap_dict=self._layers, key="layers")
+
+    def _link_obsm(self) -> None:
+        key = "obsm"
+        if self._obsm is None:
+            self._obsm = CapAnnDataDict()
+        if key in self._file.keys():
+            self._link_array_mapping(cap_dict=self._obsm, key=key)
+
+    def _link_varm(self) -> None:
+        key = "varm"
+        if self._varm is None:
+            self._varm = CapAnnDataDict()
+        if key in self._file.keys():
+            self._link_array_mapping(cap_dict=self._varm, key=key)
+
+    def _link_obsp(self):
+        key = "obsp"
+        if self._obsp is None:
+            self._obsp = CapAnnDataDict()
+
+        if key in self._file.keys():
+            self._link_array_mapping(cap_dict=self._obsp, key=key)
+
+    def _link_varp(self):
+        key = "varp"
+        if self._varp is None:
+            self._varp = CapAnnDataDict()
+
+        if key in self._file.keys():
+            self._link_array_mapping(cap_dict=self._varp, key=key)
+
+    def obsm_keys(self) -> List[str]:
+        return list(self.obsm.keys())
+
+    def obs_keys(self) -> List[str]:
+        return self.obs.column_order_array().tolist()
+
+    def var_keys(self) -> List[str]:
+        return self.var.column_order_array().tolist()
+
+    def field_to_entity(self, key):
+        if key == "obs":
+            return self.obs
+        elif key == "var":
+            return self.var
+        elif key == "raw.var":
+            return self.raw.var if self.raw is not None else None
+        elif key == "uns":
+            return self.uns
+        elif key == "layers":
+            return self.layers
+        elif key == "obsm":
+            return self.obsm
+        elif key == "varm":
+            return self.varm
+        elif key == "obsp":
+            return self.obsp
+        elif key == "varp":
+            return self.varp
+        else:
+            raise KeyError(
+                f"The field {key} is not supported! The list of supported fields are equal to {FIELDS_SUPPORTED_TO_OVERWRITE} "
+                f"attributes of the CapAnnData class."
+            )
+
+    def overwrite(self, fields: List[str] = None, compression: str = "lzf") -> None:
+        if fields is None:
+            fields = FIELDS_SUPPORTED_TO_OVERWRITE
+
+        for key in ["obs", "var", "raw.var"]:
+            if key in fields:
+                entity: CapAnnDataDF = self.field_to_entity(key)
+                if entity is None:
+                    continue
+
+                key = key.replace(".", "/") if key == "raw.var" else key
+
+                for col in entity.columns:
+                    self._write_elem(
+                        f"{key}/{col}", entity[col].values, compression=compression
+                    )
+
+                column_order = entity.column_order_array()
+                if (
+                    column_order.size == 0
+                ):  # Refs https://github.com/cellannotation/cap-anndata/issues/6
+                    column_order = np.array([], dtype=np.float64)
+
+                # Index update
+                index_name = entity.index.name
+                if not index_name:
+                    index_name = "_index"
+                self._file[key].attrs["_index"] = index_name
+                index_col = self._read_attr(self._file[key], "_index")
+                self._write_elem(
+                    f"{key}/{index_col}", entity.index.to_numpy(), compression=compression
+                )
+
+                self._file[key].attrs["column-order"] = column_order
+
+        if "uns" in fields:
+            for key in self.uns.keys():
+                if self.uns[key] is not NotLinkedObject:
+                    dest = f"uns/{key}"
+                    self._write_elem(dest, self.uns[key], compression=compression)
+            for key in self.uns.keys_to_remove:
+                del self._file[f"uns/{key}"]
+
+        for field in ["layers", "obsm", "varm", "obsp", "varp"]:
+            if field in fields:
+                for key in self.field_to_entity(field).keys_to_remove:
+                    del self._file[f"{field}/{key}"]
+
+    def create_layer(
+        self,
+        name: str,
+        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
+        matrix_shape: Union[tuple[int, int], None] = None,
+        data_dtype: Union[np.dtype, None] = None,
+        format: Union[str, None] = None,
+        compression: str = "lzf",
+    ) -> None:
+        """
+        The empty layer will be created in the case of `matrix` is None.
+        """
+        self._create_new_matrix_in_field(
+            field="layers",
+            name=name,
+            matrix=matrix,
+            matrix_shape=matrix_shape,
+            data_dtype=data_dtype,
+            format=format,
+            compression=compression,
+        )
+        self._link_layers()
+
+    def create_obsm(
+        self,
+        name: str,
+        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
+        matrix_shape: Union[tuple[int, int], None] = None,
+        data_dtype: Union[np.dtype, None] = None,
+        format: Union[str, None] = None,
+        compression: str = "lzf",
+    ) -> None:
+        self._create_new_matrix_in_field(
+            field="obsm",
+            name=name,
+            matrix=matrix,
+            matrix_shape=matrix_shape,
+            data_dtype=data_dtype,
+            format=format,
+            compression=compression,
+        )
+        self._link_obsm()
+
+    def create_varm(
+        self,
+        name: str,
+        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
+        matrix_shape: Union[tuple[int, int], None] = None,
+        data_dtype: Union[np.dtype, None] = None,
+        format: Union[str, None] = None,
+        compression: str = "lzf",
+    ) -> None:
+        self._create_new_matrix_in_field(
+            field="varm",
+            name=name,
+            matrix=matrix,
+            matrix_shape=matrix_shape,
+            data_dtype=data_dtype,
+            format=format,
+            compression=compression,
+        )
+        self._link_varm()
+
+    def create_obsp(
+        self,
+        name: str,
+        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
+        matrix_shape: Union[tuple[int, int], None] = None,
+        data_dtype: Union[np.dtype, None] = None,
+        format: Union[str, None] = None,
+        compression: str = "lzf",
+    ) -> None:
+        self._create_new_matrix_in_field(
+            field="obsp",
+            name=name,
+            matrix=matrix,
+            matrix_shape=matrix_shape,
+            data_dtype=data_dtype,
+            format=format,
+            compression=compression,
+        )
+        self._link_obsp()
+
+    def create_varp(
+        self,
+        name: str,
+        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
+        matrix_shape: Union[tuple[int, int], None] = None,
+        data_dtype: Union[np.dtype, None] = None,
+        format: Union[str, None] = None,
+        compression: str = "lzf",
+    ) -> None:
+
+        self._create_new_matrix_in_field(
+            field="varp",
+            name=name,
+            matrix=matrix,
+            matrix_shape=matrix_shape,
+            data_dtype=data_dtype,
+            format=format,
+            compression=compression,
+        )
+        self._link_varp()
+
+    def _create_new_matrix_in_field(self, field, name, **kwargs):
+        """**kwargs: matrix, matrix_shape, data_dtype, format, compression"""
+        dest = f"{field}/{name}"
+        field_entity = getattr(self, field)
+        if name in field_entity.keys():
+            raise ValueError(
+                f"Please explicitly remove the existing '{name}' entity from {field} "
+                f"before creating a new one!"
+            )
+        if field not in self._file.keys():
+            self._file.create_group(field)
+        self._create_new_matrix(dest=dest, **kwargs)
+
+    def remove_layer(self, name: str) -> None:
+        del self._file[f"layers/{name}"]
+        self._link_layers()
+
+    def remove_obsp(self, name: str) -> None:
+        del self._file[f"obsp/{name}"]
+        self._link_obsp()
+
+    def remove_varp(self, name: str) -> None:
+        del self._file[f"varp/{name}"]
+        self._link_varp()
+
+    def remove_obsm(self, name: str) -> None:
+        del self._file[f"obsm/{name}"]
+        self._link_obsm()
+
+    def remove_varm(self, name: str) -> None:
+        del self._file[f"varm/{name}"]
+        self._link_varm()
+
+    def create_repr(self) -> str:
+        indent = " " * 4
+        s = f"CapAnnData object"
+        s += f"\n{indent}File: {self._file}"
+        s += f"\n{indent}X shape: {self.shape}"
+        s += f"\n{indent}Has raw X: {self.raw is not None}"
+        for field in ["obs", "obsm", "var", "uns", "layers"]:
+            if field in self._file:
+                in_memory = set()
+                if field in ["obs", "var", "uns"]:
+                    attr = getattr(self, field)
+                    if attr is not None:
+                        in_memory = set(attr.keys())
+                keys = list(self._file[field].keys())
+                keys = [k for k in keys if k != "_index"]
+                keys = [(k if k not in in_memory else f"{k}*") for k in keys]
+                keys_str = str(keys).replace("*'", "'*")
+                s += f"\n{indent}{field}: {keys_str}"
+        s += f"\n{indent}Note: fields marked with * are in-memory objects."
+        return s
+
+    def __repr__(self) -> str:
+        return self.create_repr()
+
+    def __str__(self) -> str:
+        return self.create_repr()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *args):
+        if self._file is not None:
+            self._file.close()
+            logger.debug("CapAnnData closed!")
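
For orientation, a minimal usage sketch of the CapAnnData class shown above, based only on the methods visible in this diff; the file path and column names are hypothetical, and CapAnnDataDF is assumed to behave like a pandas DataFrame:

import h5py
from cap_anndata import CapAnnData

# Hypothetical file; CapAnnData wraps an already-open h5py.File.
file = h5py.File("example.h5ad", mode="r+")
with CapAnnData(file) as cap_adata:
    # Lazily read a single obs column from the backed file.
    cap_adata.read_obs(columns=["cell_type"])
    # Modify it in memory, then write the obs field back to disk.
    cap_adata.obs["cell_type_copy"] = cap_adata.obs["cell_type"]
    cap_adata.overwrite(fields=["obs"], compression="lzf")
# __exit__ closes the underlying h5py.File.

Note that in 0.4.0, overwrite() also persists the DataFrame index (see the "Index update" block), whereas 0.3.0 wrote only the columns and the column-order attribute.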