cap-anndata 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl

cap_anndata/__init__.py CHANGED
@@ -1,5 +1,5 @@
 from .backed_df import CapAnnDataDF
-from .backed_uns import CapAnnDataUns
+from .backed_dict import CapAnnDataDict
 from .cap_anndata import CapAnnData
 from .reader import (
     read_directly,
cap_anndata/backed_dict.py ADDED
@@ -0,0 +1,34 @@
+from typing import Set, Any
+
+
+class CapAnnDataDict(dict):
+    __keys_to_remove: Set[str] = None
+
+    def __delitem__(self, __key: Any) -> None:
+        self.keys_to_remove.add(__key)
+        return super().__delitem__(__key)
+
+    def __setitem__(self, __key: Any, __value: Any) -> None:
+        if __value is not None:
+            if __key in self.keys_to_remove:
+                self.keys_to_remove.remove(__key)
+        else:
+            self.keys_to_remove.add(__key)
+        return super().__setitem__(__key, __value)
+
+    @property
+    def keys_to_remove(self) -> Set[str]:
+        if self.__keys_to_remove is None:
+            self.__keys_to_remove = set()
+        return self.__keys_to_remove
+
+    def pop(self, __key: Any, __default: Any = None) -> Any:
+        if __key in self:
+            self.keys_to_remove.add(__key)
+        return super().pop(__key, __default)
+
+    def popitem(self) -> Any:
+        item = super().popitem()
+        key = item[0]
+        self.keys_to_remove.add(key)
+        return item
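
For orientation, a minimal sketch of the deletion tracking that `CapAnnDataDict` adds on top of `dict` (grounded in the class above; the keys and values are illustrative):

```python
from cap_anndata import CapAnnDataDict

d = CapAnnDataDict({"a": 1, "b": 2})
del d["a"]               # "a" is scheduled for removal from the file
d["b"] = None            # assigning None also schedules the key for removal
d["a"] = 3               # re-creating "a" cancels its scheduled removal
print(d.keys_to_remove)  # {'b'}
```

`overwrite()` later consumes `keys_to_remove` to delete the corresponding entries from the `.h5ad` file.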
cap_anndata/cap_anndata.py CHANGED
@@ -2,16 +2,23 @@ import logging
 import anndata as ad
 import numpy as np
 import h5py
-from typing import List, Union, Dict, Tuple, Final
-from anndata._io.specs import read_elem, write_elem
+from typing import List, Union, Any, Tuple, Final
+import scipy.sparse as ss
+from packaging import version
 
-from cap_anndata import CapAnnDataDF, CapAnnDataUns
+if version.parse(ad.__version__) < version.parse("0.11.0"):
+    from anndata.experimental import sparse_dataset, read_elem, write_elem
+else:
+    from anndata import sparse_dataset, read_elem, write_elem
 
+from cap_anndata import CapAnnDataDF, CapAnnDataDict
 
 logger = logging.getLogger(__name__)
 
-X_NOTATION = Union[h5py.Dataset, ad.experimental.CSRDataset, ad.experimental.CSCDataset]
-OBSM_NOTATION = Dict[str, X_NOTATION]
+X_NOTATION = Union[
+    h5py.Dataset, ad.experimental.CSRDataset, ad.experimental.CSCDataset, None
+]
+ARRAY_MAPPING_NOTATION = CapAnnDataDict[str, X_NOTATION]
 
 NotLinkedObject: Final = "__NotLinkedObject"
 
@@ -22,6 +29,10 @@ class BaseLayerMatrixAndDf:
         self._path_to_content = path_to_content
         self._X: X_NOTATION = None
 
+    @property
+    def file(self) -> h5py.File:
+        return self._file
+
     @property
     def X(self) -> X_NOTATION:
         if self._X is None:
@@ -35,7 +46,7 @@ class BaseLayerMatrixAndDf:
             self._X = x
         else:
             # sparse dataset
-            self._X = ad.experimental.sparse_dataset(x)
+            self._X = sparse_dataset(x)
 
     @property
     def shape(self) -> Tuple[int, int]:
@@ -76,6 +87,9 @@ class BaseLayerMatrixAndDf:
             # read whole df
             df = CapAnnDataDF.from_df(read_elem(h5_group), column_order=column_order)
         else:
+            if isinstance(columns, str):
+                # single column provided instead of list
+                columns = [columns]
             cols_to_read = [c for c in columns if c in column_order]
             df = CapAnnDataDF()
             df.column_order = column_order
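
With this guard, `_read_df` (and therefore `read_obs`/`read_var`) also accepts a single column name where a list was required before. A hedged usage sketch; the file and column names are illustrative:

```python
from cap_anndata import read_h5ad

with read_h5ad("your_data.h5ad") as cap_adata:
    cap_adata.read_obs(columns="cell_type")  # equivalent to columns=["cell_type"]
    print(cap_adata.obs.columns)             # ['cell_type']
```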
@@ -92,7 +106,9 @@ class BaseLayerMatrixAndDf:
         return df
 
     def _write_elem(self, dest_key: str, elem: any, compression: str) -> None:
-        write_elem(self._file, dest_key, elem, dataset_kwargs={"compression": compression})
+        write_elem(
+            self._file, dest_key, elem, dataset_kwargs={"compression": compression}
+        )
 
     def _validate_cap_df(self, cap_df: CapAnnDataDF, axis: int) -> None:
         if not isinstance(cap_df, CapAnnDataDF):
@@ -110,6 +126,64 @@ class BaseLayerMatrixAndDf:
                 "AnnData object!"
             )
 
+    def _link_array_mapping(self, cap_dict: CapAnnDataDict, key: str) -> None:
+        """Method to update given cap_dict with backed array entities from the file."""
+        if key not in self._file.keys():
+            raise KeyError(f"The key {key} doesn't exist in the file! Ignore linking.")
+
+        group = self._file[key]
+        if not isinstance(group, h5py.Group):
+            raise ValueError(f"The object {key} must be a group!")
+
+        for array_name in group.keys():
+            array = group[array_name]
+            if isinstance(array, h5py.Dataset):
+                cap_dict[array_name] = array
+            elif isinstance(array, h5py.Group):
+                cap_dict[array_name] = sparse_dataset(array)
+            else:
+                raise ValueError(
+                    f"Can't link array in {key} due to unsupported type of object: {type(array)}"
+                )
+
+    def _create_new_matrix(
+        self,
+        dest: str,
+        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
+        matrix_shape: Union[tuple[int, int], None] = None,
+        data_dtype: Union[np.dtype, None] = None,
+        format: Union[str, None] = None,  # TODO: use Enum instead of str
+        compression: str = "lzf",
+    ) -> None:
+        if matrix is not None:
+            self._write_elem(dest, matrix, compression=compression)
+        else:
+            if format == "dense":
+                group = self._file.create_dataset(
+                    name=dest,
+                    shape=matrix_shape,
+                    dtype=data_dtype,
+                    compression=compression,
+                )
+                # https://anndata.readthedocs.io/en/latest/fileformat-prose.html#dense-arrays-specification-v0-2-0
+                group.attrs["encoding-type"] = "array"
+                group.attrs["encoding-version"] = "0.2.0"
+            elif format in [
+                "csr",
+                "csc",
+            ]:  # Based on https://github.com/appier/h5sparse/blob/master/h5sparse/h5sparse.py
+                if data_dtype is None:
+                    data_dtype = np.float64
+                if matrix_shape is None:
+                    matrix_shape = (0, 0)
+                sparse_class = ss.csr_matrix if format == "csr" else ss.csc_matrix
+                data = sparse_class(matrix_shape, dtype=data_dtype)
+                self._write_elem(dest, data, compression=compression)
+            else:
+                raise NotImplementedError(
+                    f"Format must be 'dense', 'csr' or 'csc' but {format} given!"
+                )
+
 
 class RawLayer(BaseLayerMatrixAndDf):
     def __init__(self, h5_file: h5py.File):
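
`_create_new_matrix` is the private helper behind the public `create_layer`/`create_obsm`/`create_varm`/`create_obsp`/`create_varp` methods added further down in this diff. A hedged sketch of creating an empty backed CSR layer without loading anything into memory; the file and layer names are illustrative:

```python
import numpy as np
from cap_anndata import read_h5ad

with read_h5ad("your_data.h5ad", edit=True) as cap_adata:
    n_obs, n_var = cap_adata.shape
    # matrix=None plus an explicit format triggers the empty-matrix branch
    cap_adata.create_layer(
        name="scaled",  # illustrative layer name
        matrix=None,
        matrix_shape=(n_obs, n_var),
        data_dtype=np.float32,
        format="csr",
    )
```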
@@ -143,8 +217,12 @@ class CapAnnData(BaseLayerMatrixAndDf):
         self._obs: CapAnnDataDF = None
         self._var: CapAnnDataDF = None
         self._X: X_NOTATION = None
-        self._obsm: OBSM_NOTATION = None
-        self._uns: CapAnnDataUns = None
+        self._obsm: CapAnnDataDict = None
+        self._varm: CapAnnDataDict = None
+        self._layers: CapAnnDataDict = None
+        self._uns: CapAnnDataDict = None
+        self._obsp: CapAnnDataDict = None
+        self._varp: CapAnnDataDict = None
         self._raw: RawLayer = None
         self._shape: Tuple[int, int] = None
 
@@ -170,12 +248,6 @@ class CapAnnData(BaseLayerMatrixAndDf):
         self._validate_cap_df(cap_df, axis=1)
         self._var = cap_df
 
-    @property
-    def obsm(self) -> OBSM_NOTATION:
-        if self._obsm is None:
-            self._link_obsm()
-        return self._obsm
-
     @property
     def raw(self) -> RawLayer:
         if self._raw is None:
@@ -183,17 +255,51 @@ class CapAnnData(BaseLayerMatrixAndDf):
             logger.warning("Can't read raw.var since raw layer doesn't exist!")
             return
 
+        if len(self._file["raw"].keys()) == 0:
+            logger.warning("The raw layer is empty!")
+            return
+
         self._raw = RawLayer(self._file)
         return self._raw
 
     @property
-    def uns(self) -> CapAnnDataUns:
+    def uns(self) -> CapAnnDataDict[str, Any]:
         if self._uns is None:
-            self._uns = CapAnnDataUns(
+            self._uns = CapAnnDataDict(
                 {k: NotLinkedObject for k in self._file["uns"].keys()}
             )
         return self._uns
 
+    @property
+    def layers(self) -> CapAnnDataDict[str, X_NOTATION]:
+        if self._layers is None:
+            self._link_layers()
+        return self._layers
+
+    @property
+    def obsm(self) -> CapAnnDataDict[str, X_NOTATION]:
+        if self._obsm is None:
+            self._link_obsm()
+        return self._obsm
+
+    @property
+    def varm(self) -> CapAnnDataDict[str, X_NOTATION]:
+        if self._varm is None:
+            self._link_varm()
+        return self._varm
+
+    @property
+    def obsp(self) -> CapAnnDataDict[str, X_NOTATION]:
+        if self._obsp is None:
+            self._link_obsp()
+        return self._obsp
+
+    @property
+    def varp(self) -> CapAnnDataDict[str, X_NOTATION]:
+        if self._varp is None:
+            self._link_varp()
+        return self._varp
+
     def read_obs(self, columns: List[str] = None, reset: bool = False) -> None:
         df = self._read_df("obs", columns=columns)
         if self.obs.empty or reset:
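
All five mapping properties link lazily on first access and hand back backed objects, so keys and shapes are available without reading the matrices. A hedged sketch; the `X_umap` key is illustrative:

```python
from cap_anndata import read_h5ad

with read_h5ad("your_data.h5ad") as cap_adata:
    print(list(cap_adata.layers.keys()))  # linked, but nothing read into memory
    emb = cap_adata.obsm.get("X_umap")    # h5py.Dataset or sparse dataset, still backed
    if emb is not None:
        print(emb.shape)                  # metadata only
        matrix = emb[:]                   # materialize explicitly when needed
```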
@@ -210,12 +316,72 @@ class CapAnnData(BaseLayerMatrixAndDf):
         for col in df.columns:
             self._var[col] = df[col]
 
+    def read_uns(self, keys: List[str] = None) -> None:
+        if keys is None:
+            keys = list(self.uns.keys())
+
+        for key in keys:
+            existing_keys = self.uns.keys()
+            if key in existing_keys:
+                source = self._file[f"uns/{key}"]
+                self.uns[key] = read_elem(source)
+
+    def _link_layers(self) -> None:
+        if self._layers is None:
+            self._layers = CapAnnDataDict()
+        if "layers" in self._file.keys():
+            self._link_array_mapping(cap_dict=self._layers, key="layers")
+
+    def _link_obsm(self) -> None:
+        key = "obsm"
+        if self._obsm is None:
+            self._obsm = CapAnnDataDict()
+        if key in self._file.keys():
+            self._link_array_mapping(cap_dict=self._obsm, key=key)
+
+    def _link_varm(self) -> None:
+        key = "varm"
+        if self._varm is None:
+            self._varm = CapAnnDataDict()
+        if key in self._file.keys():
+            self._link_array_mapping(cap_dict=self._varm, key=key)
+
+    def _link_obsp(self):
+        key = "obsp"
+        if self._obsp is None:
+            self._obsp = CapAnnDataDict()
+
+        if key in self._file.keys():
+            self._link_array_mapping(cap_dict=self._obsp, key=key)
+
+    def _link_varp(self):
+        key = "varp"
+        if self._varp is None:
+            self._varp = CapAnnDataDict()
+
+        if key in self._file.keys():
+            self._link_array_mapping(cap_dict=self._varp, key=key)
+
+    def obsm_keys(self) -> List[str]:
+        return list(self.obsm.keys())
+
+    def obs_keys(self) -> List[str]:
+        return self.obs.column_order.tolist()
+
+    def var_keys(self) -> List[str]:
+        return self.var.column_order.tolist()
+
     def overwrite(self, fields: List[str] = None, compression: str = "lzf") -> None:
         field_to_entity = {
             "obs": self.obs,
             "var": self.var,
             "raw.var": self.raw.var if self.raw is not None else None,
             "uns": self.uns,
+            "layers": self.layers,
+            "obsm": self.obsm,
+            "varm": self.varm,
+            "obsp": self.obsp,
+            "varp": self.varp,
         }
 
         if fields is None:
@@ -237,7 +403,9 @@ class CapAnnData(BaseLayerMatrixAndDf):
             key = key.replace(".", "/") if key == "raw.var" else key
 
             for col in entity.columns:
-                self._write_elem(f"{key}/{col}", entity[col].values, compression=compression)
+                self._write_elem(
+                    f"{key}/{col}", entity[col].values, compression=compression
+                )
 
             column_order = entity.column_order
             if (
@@ -254,34 +422,179 @@ class CapAnnData(BaseLayerMatrixAndDf):
             for key in self.uns.keys_to_remove:
                 del self._file[f"uns/{key}"]
 
-    def read_uns(self, keys: List[str] = None) -> None:
-        if keys is None:
-            keys = list(self.uns.keys())
-
-        for key in keys:
-            existing_keys = self.uns.keys()
-            if key in existing_keys:
-                source = self._file[f"uns/{key}"]
-                self.uns[key] = read_elem(source)
-
-    def _link_obsm(self) -> None:
-        self._obsm = {}
-        if "obsm" in self._file.keys():
-            obsm_group = self._file["obsm"]
-            for entity_name in obsm_group.keys():
-                entity = obsm_group[entity_name]
-                if isinstance(entity, h5py.Dataset):
-                    # dense array
-                    self._obsm[entity_name] = entity
-                else:
-                    # sparse array
-                    self._obsm[entity_name] = ad.experimental.sparse_dataset(entity)
-
-    def obsm_keys(self) -> List[str]:
-        return list(self.obsm.keys())
-
-    def obs_keys(self) -> List[str]:
-        return self.obs.column_order.tolist()
-
-    def var_keys(self) -> List[str]:
-        return self.var.column_order.tolist()
+        for field in ["layers", "obsm", "varm", "obsp", "varp"]:
+            if field in fields:
+                for key in field_to_entity[field].keys_to_remove:
+                    del self._file[f"{field}/{key}"]
+
+    def create_layer(
+        self,
+        name: str,
+        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
+        matrix_shape: Union[tuple[int, int], None] = None,
+        data_dtype: Union[np.dtype, None] = None,
+        format: Union[str, None] = None,
+        compression: str = "lzf",
+    ) -> None:
+        """
+        An empty layer will be created if `matrix` is None.
+        """
+        self._create_new_matrix_in_field(
+            field="layers",
+            name=name,
+            matrix=matrix,
+            matrix_shape=matrix_shape,
+            data_dtype=data_dtype,
+            format=format,
+            compression=compression,
+        )
+        self._link_layers()
+
+    def create_obsm(
+        self,
+        name: str,
+        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
+        matrix_shape: Union[tuple[int, int], None] = None,
+        data_dtype: Union[np.dtype, None] = None,
+        format: Union[str, None] = None,
+        compression: str = "lzf",
+    ) -> None:
+        self._create_new_matrix_in_field(
+            field="obsm",
+            name=name,
+            matrix=matrix,
+            matrix_shape=matrix_shape,
+            data_dtype=data_dtype,
+            format=format,
+            compression=compression,
+        )
+        self._link_obsm()
+
+    def create_varm(
+        self,
+        name: str,
+        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
+        matrix_shape: Union[tuple[int, int], None] = None,
+        data_dtype: Union[np.dtype, None] = None,
+        format: Union[str, None] = None,
+        compression: str = "lzf",
+    ) -> None:
+        self._create_new_matrix_in_field(
+            field="varm",
+            name=name,
+            matrix=matrix,
+            matrix_shape=matrix_shape,
+            data_dtype=data_dtype,
+            format=format,
+            compression=compression,
+        )
+        self._link_varm()
+
+    def create_obsp(
+        self,
+        name: str,
+        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
+        matrix_shape: Union[tuple[int, int], None] = None,
+        data_dtype: Union[np.dtype, None] = None,
+        format: Union[str, None] = None,
+        compression: str = "lzf",
+    ) -> None:
+        self._create_new_matrix_in_field(
+            field="obsp",
+            name=name,
+            matrix=matrix,
+            matrix_shape=matrix_shape,
+            data_dtype=data_dtype,
+            format=format,
+            compression=compression,
+        )
+        self._link_obsp()
+
+    def create_varp(
+        self,
+        name: str,
+        matrix: Union[np.ndarray, ss.csr_matrix, ss.csc_matrix, None] = None,
+        matrix_shape: Union[tuple[int, int], None] = None,
+        data_dtype: Union[np.dtype, None] = None,
+        format: Union[str, None] = None,
+        compression: str = "lzf",
+    ) -> None:
+
+        self._create_new_matrix_in_field(
+            field="varp",
+            name=name,
+            matrix=matrix,
+            matrix_shape=matrix_shape,
+            data_dtype=data_dtype,
+            format=format,
+            compression=compression,
+        )
+        self._link_varp()
+
+    def _create_new_matrix_in_field(self, field, name, **kwargs):
+        """**kwargs: matrix, matrix_shape, data_dtype, format, compression"""
+        dest = f"{field}/{name}"
+        field_entity = getattr(self, field)
+        if name in field_entity.keys():
+            raise ValueError(
+                f"Please explicitly remove the existing '{name}' entity from {field} "
+                f"before creating a new one!"
+            )
+        if field not in self._file.keys():
+            self._file.create_group(field)
+        self._create_new_matrix(dest=dest, **kwargs)
+
+    def remove_layer(self, name: str) -> None:
+        del self._file[f"layers/{name}"]
+        self._link_layers()
+
+    def remove_obsp(self, name: str) -> None:
+        del self._file[f"obsp/{name}"]
+        self._link_obsp()
+
+    def remove_varp(self, name: str) -> None:
+        del self._file[f"varp/{name}"]
+        self._link_varp()
+
+    def remove_obsm(self, name: str) -> None:
+        del self._file[f"obsm/{name}"]
+        self._link_obsm()
+
+    def remove_varm(self, name: str) -> None:
+        del self._file[f"varm/{name}"]
+        self._link_varm()
+
+    def create_repr(self) -> str:
+        indent = " " * 4
+        s = f"CapAnnData object"
+        s += f"\n{indent}File: {self._file}"
+        s += f"\n{indent}X shape: {self.shape}"
+        s += f"\n{indent}Has raw X: {self.raw is not None}"
+        for field in ["obs", "obsm", "var", "uns", "layers"]:
+            if field in self._file:
+                in_memory = set()
+                if field in ["obs", "var", "uns"]:
+                    attr = getattr(self, field)
+                    if attr is not None:
+                        in_memory = set(attr.keys())
+                keys = list(self._file[field].keys())
+                keys = [k for k in keys if k != "_index"]
+                keys = [(k if k not in in_memory else f"{k}*") for k in keys]
+                keys_str = str(keys).replace("*'", "'*")
+                s += f"\n{indent}{field}: {keys_str}"
+        s += f"\n{indent}Note: fields marked with * are in-memory objects."
+        return s
+
+    def __repr__(self) -> str:
+        return self.create_repr()
+
+    def __str__(self) -> str:
+        return self.create_repr()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *args):
+        if self._file is not None:
+            self._file.close()
+            logger.debug("CapAnnData closed!")
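
Taken together, a hedged end-to-end sketch of the new mapping API; `X_pca` and `counts2` are illustrative names, not keys guaranteed to exist in a given file:

```python
import numpy as np
import scipy.sparse as ss
from cap_anndata import read_h5ad

with read_h5ad("your_data.h5ad", edit=True) as cap_adata:
    n_obs, n_var = cap_adata.shape
    # Written to the file immediately; no overwrite() call is needed for these
    cap_adata.create_obsm(name="X_pca", matrix=np.zeros((n_obs, 2), dtype=np.float32))
    cap_adata.create_layer(name="counts2", matrix=ss.csr_matrix((n_obs, n_var), dtype=np.float32))
    print(cap_adata)                # the new repr lists on-disk keys; '*' marks in-memory ones
    cap_adata.remove_obsm("X_pca")  # deletes obsm/X_pca from the file right away
```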
cap_anndata/reader.py CHANGED
@@ -1,6 +1,6 @@
 import logging
-import contextlib
 import h5py
+import warnings
 
 from cap_anndata import CapAnnData
 
@@ -8,7 +8,6 @@ from cap_anndata import CapAnnData
 logger = logging.getLogger(__name__)
 
 
-@contextlib.contextmanager
 def read_h5ad(file_path: str, edit: bool = False):
     """
     This is the main read method for CapAnnData.
@@ -21,24 +20,38 @@ def read_h5ad(file_path: str, edit: bool = False):
         file = h5py.File(file_path, mode)
         cap_adata = CapAnnData(file)
         logger.debug(f"Successfully read anndata file path {file_path}")
-        yield cap_adata
+        return cap_adata
 
     except Exception as error:
-        logger.error(f"Error during read anndata file at path: {file_path}, error = {error}!")
+        logger.error(
+            f"Error during read anndata file at path: {file_path}, error = {error}!"
+        )
         raise error
 
-    finally:
-        file.close()
-        logger.debug("AnnData closed!")
 
+def deprecated(message):
+    def deprecated_decorator(func):
+        def deprecated_func(*args, **kwargs):
+            warnings.warn(
+                "{} is a deprecated function. {}".format(func.__name__, message),
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+            warnings.simplefilter("default", DeprecationWarning)
+            return func(*args, **kwargs)
 
+        return deprecated_func
+
+    return deprecated_decorator
+
+
+# TODO: remove deprecated function
+@deprecated(
+    "It will be removed in the next version of the package. Please replace it with `read_h5ad`."
+)
 def read_directly(file_path: str, edit: bool = False) -> CapAnnData:
     """
     Must be used only in specific cases.
     The user is responsible for closing the h5py file when the work with the CapAnnData instance is done.
     """
-    mode = "r+" if edit else "r"
-    logger.debug(f"Read file {file_path} mode={mode} directly...")
-    file = h5py.File(file_path, mode)
-    cap_adata = CapAnnData(file)
-    return cap_adata
+    return read_h5ad(file_path, edit)
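
Because `read_h5ad` now returns the `CapAnnData` instance instead of yielding it from a `@contextlib.contextmanager` generator, the with-statement support comes from `CapAnnData.__enter__`/`__exit__` (added in `cap_anndata.py` above). Both styles below should be equivalent; the file name is illustrative:

```python
from cap_anndata import read_h5ad

# Style 1: context manager; CapAnnData.__exit__ closes the file
with read_h5ad("your_data.h5ad") as cap_adata:
    print(cap_adata.obs_keys())

# Style 2: plain handle; close explicitly when done
cap_adata = read_h5ad("your_data.h5ad")
try:
    print(cap_adata.obs_keys())
finally:
    cap_adata.file.close()  # the new `file` property exposes the underlying h5py.File
```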
cap_anndata-0.3.0.dist-info/METADATA ADDED
@@ -0,0 +1,54 @@
+Metadata-Version: 2.1
+Name: cap_anndata
+Version: 0.3.0
+Summary: Partial read/write of AnnData (h5ad) files for low-memory operations with large datasets.
+Home-page: https://github.com/cellannotation/cap-anndata
+Author: R. Mukhin, A. Isaev
+Author-email: roman@ebookapplications.com
+Project-URL: Bug Tracker, https://github.com/cellannotation/cap-anndata/issues
+Classifier: Programming Language :: Python :: 3.9
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy >=1.23.5
+Requires-Dist: pandas >=2.2.0
+Requires-Dist: anndata >=0.10.0
+Provides-Extra: dev
+Requires-Dist: pytest >=8.0.0 ; extra == 'dev'
+Requires-Dist: setuptools ~=69.1.1 ; extra == 'dev'
+
+# CAP-AnnData: Partial I/O for AnnData (.h5ad) Files
+
+## Overview
+CAP-AnnData offers functionality for selective reading and writing of [AnnData](https://pypi.org/project/anndata/)
+file fields without loading the entire dataset (or even an entire field) into memory.
+For example, it allows you to read and modify a single `obs` column while taking nothing into memory except the column itself.
+The package aims to replicate the original AnnData API as closely as possible,
+while providing additional features for efficient manipulation of heavy datasets.
+
+## Installation
+Install CAP-AnnData via pip:
+
+```commandline
+pip install -U cap-anndata
+```
+
+## Basic Example
+
+The example below shows how to read a single `obs` column, create a new obs column, and write it back to the `.h5ad` file.
+```python
+from cap_anndata import read_h5ad
+
+file_path = "your_data.h5ad"
+with read_h5ad(file_path=file_path, edit=True) as cap_adata:
+    print(cap_adata.obs_keys())  # ['a', 'b', 'c']
+    print(cap_adata.obs)  # Empty DataFrame
+    cap_adata.read_obs(columns=['a'])
+    print(cap_adata.obs.columns)  # ['a']
+    cap_adata.obs['new_col'] = cap_adata.obs['a']
+    cap_adata.overwrite(fields=['obs'])
+```
+
+More examples can be found in the [How-TO](https://github.com/cellannotation/cap-anndata/blob/main/HOWTO.md) file.
cap_anndata-0.3.0.dist-info/RECORD ADDED
@@ -0,0 +1,10 @@
+cap_anndata/__init__.py,sha256=m-iyYXl6oIgczQMXr_rqhoObblRAs37YYxMoWidm7i4,207
+cap_anndata/backed_df.py,sha256=06wZwEjszFQ8lkvy6-GgD_SD14idu9857RtlfMQiBjE,2691
+cap_anndata/backed_dict.py,sha256=jPJl7RxPxV7s5ywD23ZxkInWPrgValyKHmlKZplDuTE,1053
+cap_anndata/cap_anndata.py,sha256=RDozLa-RZoNq_-CWNbrEoLbrNfaD8GkIU8vmAkxFuoQ,21197
+cap_anndata/reader.py,sha256=yiY8kButhg5TDc_OcXNOZkJv5Bbdht3XOzswjgDogdQ,1666
+cap_anndata-0.3.0.dist-info/LICENSE,sha256=JAV0w7TBl6wQe9iFcCKjAWgpurym0f-Q0B75zm2PrKw,1560
+cap_anndata-0.3.0.dist-info/METADATA,sha256=Fj4jPwlPbFr_u-e8-cW2KX5H0bUyhiZ5wcNACGrwK9w,2172
+cap_anndata-0.3.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+cap_anndata-0.3.0.dist-info/top_level.txt,sha256=GKi_Uk4LUhXwWBfFCTIyJvEoJqFREt_4uH4CWgeLsg4,12
+cap_anndata-0.3.0.dist-info/RECORD,,
cap_anndata-0.3.0.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.42.0)
+Generator: setuptools (75.1.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
cap_anndata/backed_uns.py DELETED
@@ -1,28 +0,0 @@
-from typing import List, Any
-
-
-class CapAnnDataUns(dict):
-    __keys_to_remove: List[str] = []
-
-    def __delitem__(self, __key: Any) -> None:
-        self.__keys_to_remove.append(__key)
-        return super().__delitem__(__key)
-
-    def __setitem__(self, __key: Any, __value: Any) -> None:
-        if __key in self.__keys_to_remove:
-            self.__keys_to_remove.remove(__key)
-        return super().__setitem__(__key, __value)
-
-    @property
-    def keys_to_remove(self):
-        return self.__keys_to_remove
-
-    def pop(self, __key: Any, __default: Any = None) -> Any:
-        if __key in self:
-            self.__keys_to_remove.append(__key)
-        return super().pop(__key, __default)
-
-    def popitem(self) -> Any:
-        item = super().popitem()
-        self.__keys_to_remove.append(item[0])
-        return item
cap_anndata-0.2.2.dist-info/METADATA DELETED
@@ -1,253 +0,0 @@
-Metadata-Version: 2.1
-Name: cap_anndata
-Version: 0.2.2
-Summary: Partial read/write of AnnData (h5ad) files for low-memory operations with large datasets.
-Home-page: https://github.com/cellannotation/cap-anndata
-Author: R. Mukhin, A. Isaev
-Author-email: roman@ebookapplications.com
-Project-URL: Bug Tracker, https://github.com/cellannotation/cap-anndata/issues
-Classifier: Programming Language :: Python :: 3.9
-Classifier: License :: OSI Approved :: BSD License
-Classifier: Operating System :: OS Independent
-Requires-Python: >=3.9
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: numpy ~=1.26.3
-Requires-Dist: pandas ~=2.2.0
-Requires-Dist: anndata ~=0.10.5
-Requires-Dist: h5py ~=3.5.0
-Provides-Extra: dev
-Requires-Dist: pytest >=8.0.0 ; extra == 'dev'
-Requires-Dist: setuptools ~=69.1.1 ; extra == 'dev'
-
-# CAP-AnnData: Enhanced Partial I/O for AnnData Files
-
-## Overview
-CAP-AnnData enriches the AnnData ecosystem by offering tailored functionality for partial reading and writing of AnnData files. This enhancement allows selective manipulation of sections such as `obs`, `var`, `X`, `raw.X`, `obsm`, and `uns` without loading entire datasets into memory. Leveraging AnnData's native methods, CAP-AnnData aims to maintain backward compatibility while improving efficiency, which is especially useful for large-scale single-cell genomics data.
-
-## Getting Started
-
-### Installation
-Install CAP-AnnData via pip:
-
-```commandline
-pip install -U cap-anndata
-```
-
-### Running Tests
-Ensure the integrity and reliability of CAP-AnnData on your system by running the unit tests via `pytest` from the root of the repo.
-
-```commandline
-pip install pytest
-pytest test
-```
-
-Make sure Python 3.9 or newer is used, along with all requirements specified in requirements.txt.
-
-## How-TO:
-
-#### 1. Access AnnData File DataFrames
-
-##### Basic Reading
-By default, `CapAnnData` does not automatically read any data. To begin working with dataframes, you need to explicitly read the data from the AnnData file. You can read the entire dataframe or select specific columns. For partial reading, provide a list of column names.
-
-```python
-from cap_anndata import read_h5ad
-
-file_path = "your_data.h5ad"
-with read_h5ad(file_path=file_path, edit=False) as cap_adata:
-    # Get the list of all obs columns in the AnnData file
-    cap_adata.obs_keys()  # ['a', 'b', 'c']
-    # Read all columns of 'obs'
-    cap_adata.read_obs()
-    # Get the list of columns of the DataFrame in memory
-    cap_adata.obs.columns  # ['a', 'b', 'c']
-
-    # Get the list of all var columns in the AnnData file
-    cap_adata.var_keys()  # ['d', 'e', 'f']
-    # Read specific columns of 'var'
-    cap_adata.read_var(columns=['d'])
-    cap_adata.var.columns  # ['d']
-    # Read an additional column
-    cap_adata.read_var(columns=['e'])
-    cap_adata.var.columns  # ['d', 'e']
-
-    # Read a column and reset the in-memory DataFrame before that
-    cap_adata.read_var(columns=['f'], reset=True)
-    cap_adata.var.columns  # ['f']
-
-    # Read no columns of raw.var (only the index)
-    cap_adata.raw.read_var(columns=[])
-```
-
-##### Difference between `obs_keys()` and `obs.columns`
-`obs_keys()` returns the list of columns in the on-disk AnnData file, while `obs.columns` returns the list of columns in the in-memory DataFrame. The two lists may differ if you read only specific columns. If you modify the in-memory DataFrame, `obs_keys()` will reflect the changes. It is recommended to check `obs_keys()` before the `overwrite()` call to avoid damaging the AnnData file.
-
-If a column doesn't exist in the file, no error will be raised, but the column will be missing from the resulting DataFrame. So the list of columns means something closer to "try to read these columns from the file". This is needed because there is no way yet to check whether a column exists before the read. The behavior of `var_keys()` and `var.columns` is exactly the same.
-
-#### 2. Modify the AnnData File DataFrames In-Place
-
-You can directly modify the dataframe by adding, renaming, or removing columns.
-
-```python
-# Create a new column
-cap_adata.obs['new_col'] = [value1, value2, value3]
-
-# Rename a column
-cap_adata.obs.rename_column('old_col_name', 'new_col_name')
-
-# Remove a column
-cap_adata.obs.remove_column('col_to_remove')
-```
-
-After modifications, you can overwrite the changes back to the AnnData file. If a value doesn't exist, it will be created.
-Note: `read_h5ad` must be called with the `edit=True` argument to open the `.h5ad` file in `r+` mode.
-
-```python
-# overwrite all values which were read
-cap_adata.overwrite()
-
-# overwrite chosen fields
-cap_adata.overwrite(['obs', 'var'])
-```
-
-The full list of supported fields: `obs`, `var`, `raw.var`, `obsm`, and `uns`.
-
-#### 3. How to Read a Few Columns but Overwrite One in a Dataframe
-
-The only way to do this for now is to drop the unneeded columns from the in-memory dataframe (with `pandas.drop`!) before calling the `overwrite` method.
-
-```python
-# Read specific columns
-cap_adata.read_obs(columns=['cell_type', 'sample'])
-
-# Drop a column in-memory
-# DON'T USE remove_column here!
-cap_adata.obs.drop(columns='sample', inplace=True)
-
-# Overwrite changes
-cap_adata.overwrite(['obs'])
-
-# NOTE that the line
-# cap_adata.read_obs(columns=['sample'], reset=True)
-# would override in-memory changes with values from the AnnData file
-```
-
-#### 4. How to Work with X and raw.X
-
-The CapAnnData package won't read any field by default. However, `X` and `raw.X` will be linked to the backed matrices automatically upon the first request to those fields.
-The X object will be returned as an `h5py.Dataset` or `anndata.experimental.sparse_dataset`.
-
-```python
-with read_h5ad(file_path=file_path, edit=False) as cap_adata:
-    # self._X is None here; nothing has been read yet
-
-    # will return the h5py.Dataset or CSRDataset
-    x = cap_adata.X
-
-    # The same for raw.X
-    raw_x = cap_adata.raw.X
-
-    # take the whole matrix into memory
-    x = cap_adata.X[:]
-```
-
-CapAnnData supports the standard `numpy`/`h5py` slicing rules:
-
-```python
-# slice rows
-s_ = np.s_[0:5]
-# slice columns
-s_ = np.s_[:, 0:5]
-# boolean mask + slicing
-mask = np.array([i < 5 for i in range(cap_adata.shape[0])])
-s_ = np.s_[mask, :5]
-```
-
-#### 5. How to Handle obsm Embedding Matrices
-
-By default, CapAnnData will not read the embedding matrices.
-The link to the h5py objects will be created upon the first call of the `.obsm` property.
-As in the AnnData package, a call like `cap_adata.obsm["X_tsne"]` will not return the in-memory matrix but the backed version instead.
-It is possible to get information about the names and shapes of the embeddings without taking the whole matrix into memory.
-
-```python
-with read_h5ad(file_path=file_path, edit=False) as cap_adata:
-    # will return the list of strings
-    obsm_keys = cap_adata.obsm_keys()
-
-    # return the shape of the matrix in backed mode
-    embeddings = obsm_keys[0]
-    shape = cap_adata.obsm[embeddings].shape
-
-    # take the whole matrix into memory
-    matrix = cap_adata.obsm[embeddings][:]
-```
-
-#### 6. How to Read and Modify the uns Section
-
-The `CapAnnData` class will lazily link the uns section upon the first call but ***WILL NOT*** read it into memory. Instead, a dictionary of the pairs `{'key': "__NotLinkedObject"}` will be created. This allows getting the list of keys before the actual read. To read the uns section into memory, the `.read_uns(keys)` method must be called.
-
-```python
-with read_h5ad(file_path=file_path, edit=True) as cap_adata:
-    # will return the list of keys
-    keys = list(cap_adata.uns.keys())
-
-    # read into memory the first key only
-    cap_adata.read_uns([keys[0]])
-
-    # read the whole uns section into memory
-    cap_adata.read_uns()
-```
-
-Once the `.uns` section is in memory (partially or completely), we can work with it as with a regular Python `dict` object. The main feature of the `CapAnnDataUns` class, which inherits from `dict`, is the tracking of the keys which must be removed from the `.h5ad` file upon overwrite.
-
-```python
-# get the value
-v = cap_adata.uns["key1"]
-v = cap_adata.uns.get("key1")
-
-# modify values
-cap_adata.uns["key1"] = "new_value"
-
-# create new keys
-cap_adata.uns["new_key"] = "value"
-
-# remove keys
-cap_adata.uns.pop("key1")  # the recommended way
-del cap_adata.uns["key2"]
-cap_adata.uns.popitem()
-```
-
-To save `uns` changes, the method `CapAnnData.overwrite()` must be called.
-
-```python
-cap_adata.overwrite()  # all in-memory fields will be overwritten
-cap_adata.overwrite(["uns"])  # overwrite the uns section only
-```
-
-#### 7. Join and Merge DataFrames
-
-CAP-AnnData provides enhanced methods for joining and merging dataframes, preserving column order and data integrity.
-
-```python
-from cap_anndata import CapAnnDataDF
-import pandas as pd
-
-data1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
-data2 = pd.DataFrame({'D': [7, 8, 9], 'E': [10, 11, 12]})
-cap_anndata_df1 = CapAnnDataDF.from_df(data1, column_order=['A', 'B', 'C'])
-
-cap_df = cap_anndata_df1.join(data2, how='left')
-
-cap_df.columns  # ['A', 'B', 'D', 'E']
-cap_df.column_order  # ['A', 'B', 'C', 'D', 'E']
-
-data3 = pd.DataFrame({'A': [2, 3, 4], 'D': [10, 11, 12]})
-cap_df = cap_anndata_df1.merge(data3, on='A')
-
-cap_df.columns  # ['A', 'B', 'D']
-cap_df.column_order  # ['A', 'B', 'C', 'D']
-cap_df.shape  # (2, 3)
-```
cap_anndata-0.2.2.dist-info/RECORD DELETED
@@ -1,10 +0,0 @@
-cap_anndata/__init__.py,sha256=l9lvFpcMsQksp8_dI-fjUgrImoMdztbu3jVSdmxNPmA,205
-cap_anndata/backed_df.py,sha256=06wZwEjszFQ8lkvy6-GgD_SD14idu9857RtlfMQiBjE,2691
-cap_anndata/backed_uns.py,sha256=Tfxoz3RgcgENf4SvxFOox9w048K2QmBTh1VbAf4yqVI,854
-cap_anndata/cap_anndata.py,sha256=fEaIwWIKKDJpIsQ7cwOfUTmUReIyryv5qRDqRjRsWhU,10185
-cap_anndata/reader.py,sha256=kg9xoS_S0gY6WpsHE8PwGMa14VXh9Ibqjw4bwoerYsE,1267
-cap_anndata-0.2.2.dist-info/LICENSE,sha256=JAV0w7TBl6wQe9iFcCKjAWgpurym0f-Q0B75zm2PrKw,1560
-cap_anndata-0.2.2.dist-info/METADATA,sha256=h41dgoz3w2rDHnic828FahjEoKq1lt_Bi1jm-ZX-goA,9569
-cap_anndata-0.2.2.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-cap_anndata-0.2.2.dist-info/top_level.txt,sha256=GKi_Uk4LUhXwWBfFCTIyJvEoJqFREt_4uH4CWgeLsg4,12
-cap_anndata-0.2.2.dist-info/RECORD,,