cap-anndata 0.3.1__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cap_anndata/backed_df.py +32 -20
- cap_anndata/cap_anndata.py +77 -48
- {cap_anndata-0.3.1.dist-info → cap_anndata-0.5.0.dist-info}/METADATA +21 -7
- cap_anndata-0.5.0.dist-info/RECORD +10 -0
- {cap_anndata-0.3.1.dist-info → cap_anndata-0.5.0.dist-info}/WHEEL +1 -1
- cap_anndata-0.3.1.dist-info/RECORD +0 -10
- {cap_anndata-0.3.1.dist-info → cap_anndata-0.5.0.dist-info/licenses}/LICENSE +0 -0
- {cap_anndata-0.3.1.dist-info → cap_anndata-0.5.0.dist-info}/top_level.txt +0 -0
cap_anndata/backed_df.py
CHANGED
@@ -1,13 +1,10 @@
|
|
1
1
|
import pandas as pd
|
2
2
|
import numpy as np
|
3
3
|
from typing import List, Any, Union
|
4
|
-
import logging
|
5
4
|
|
6
5
|
from pandas._typing import Self
|
7
6
|
from pandas.core.generic import bool_t
|
8
7
|
|
9
|
-
logger = logging.getLogger(__name__)
|
10
|
-
|
11
8
|
|
12
9
|
class CapAnnDataDF(pd.DataFrame):
|
13
10
|
"""
|
@@ -19,26 +16,37 @@ class CapAnnDataDF(pd.DataFrame):
|
|
19
16
|
|
20
17
|
_metadata = ["column_order"]
|
21
18
|
|
19
|
+
def column_order_array(self) -> np.array:
|
20
|
+
order = self.column_order
|
21
|
+
if order is not None and isinstance(order, List):
|
22
|
+
# Convert it to numpy array of str elements
|
23
|
+
return np.array(order, dtype=object)
|
24
|
+
else:
|
25
|
+
return order
|
26
|
+
|
22
27
|
def rename_column(self, old_name: str, new_name: str) -> None:
|
23
|
-
i = np.where(self.
|
24
|
-
|
28
|
+
i = np.where(self.column_order_array() == old_name)[0]
|
29
|
+
tmp_array = self.column_order_array().copy()
|
30
|
+
tmp_array[i] = new_name
|
31
|
+
self.column_order = tmp_array.copy()
|
25
32
|
self.rename(columns={old_name: new_name}, inplace=True)
|
26
33
|
|
27
34
|
def remove_column(self, col_name: str) -> None:
|
28
|
-
i = np.where(self.
|
29
|
-
self.column_order = np.delete(self.
|
35
|
+
i = np.where(self.column_order_array() == col_name)[0]
|
36
|
+
self.column_order = np.delete(self.column_order_array(), i)
|
30
37
|
self.drop(columns=[col_name], inplace=True)
|
31
38
|
|
32
39
|
def __setitem__(self, key, value) -> None:
|
33
|
-
if key not in self.
|
34
|
-
self.column_order = np.append(self.
|
40
|
+
if key not in self.column_order_array():
|
41
|
+
self.column_order = np.append(self.column_order_array(), key)
|
35
42
|
return super().__setitem__(key, value)
|
36
43
|
|
37
44
|
@classmethod
|
38
|
-
def from_df(cls, df: pd.DataFrame, column_order: List[str] = None) -> Self:
|
45
|
+
def from_df(cls, df: pd.DataFrame, column_order: Union[np.array, List[str], None] = None) -> Self:
|
39
46
|
if column_order is None:
|
40
47
|
column_order = df.columns.to_numpy()
|
41
|
-
|
48
|
+
elif isinstance(column_order, List):
|
49
|
+
column_order = np.array(column_order)
|
42
50
|
new_inst = cls(df)
|
43
51
|
new_inst.column_order = column_order
|
44
52
|
return new_inst
|
@@ -47,23 +55,27 @@ class CapAnnDataDF(pd.DataFrame):
|
|
47
55
|
result = super().join(other=other, **kwargs)
|
48
56
|
if isinstance(other, CapAnnDataDF):
|
49
57
|
new_columns = [
|
50
|
-
col for col in other.
|
58
|
+
col for col in other.column_order_array() if col not in self.column_order_array()
|
51
59
|
]
|
52
60
|
else:
|
53
|
-
new_columns = [col for col in other.columns if col not in self.
|
54
|
-
column_order = np.append(self.
|
55
|
-
|
61
|
+
new_columns = [col for col in other.columns if col not in self.column_order_array()]
|
62
|
+
column_order = np.append(self.column_order_array(), new_columns)
|
63
|
+
df = self.from_df(result, column_order=column_order)
|
64
|
+
return df
|
56
65
|
|
57
66
|
def merge(self, right, **kwargs) -> Self:
|
58
67
|
result = super().merge(right=right, **kwargs)
|
59
68
|
if isinstance(right, CapAnnDataDF):
|
60
69
|
new_columns = [
|
61
|
-
col for col in right.
|
70
|
+
col for col in right.column_order_array() if col not in self.column_order_array()
|
62
71
|
]
|
63
72
|
else:
|
64
|
-
new_columns = [col for col in right.columns if col not in self.
|
65
|
-
column_order = np.append(self.
|
66
|
-
|
73
|
+
new_columns = [col for col in right.columns if col not in self.column_order_array()]
|
74
|
+
column_order = np.append(self.column_order_array(), new_columns)
|
75
|
+
df = self.from_df(result, column_order=column_order)
|
76
|
+
return df
|
67
77
|
|
68
78
|
def copy(self, deep: Union[bool_t, None] = True) -> Self:
|
69
|
-
|
79
|
+
column_order = self.column_order_array()
|
80
|
+
df = self.from_df(super().copy(deep=deep), column_order=column_order)
|
81
|
+
return df
|
cap_anndata/cap_anndata.py
CHANGED
@@ -7,19 +7,33 @@ import scipy.sparse as ss
|
|
7
7
|
from packaging import version
|
8
8
|
|
9
9
|
if version.parse(ad.__version__) < version.parse("0.11.0"):
|
10
|
-
from anndata.experimental import
|
10
|
+
from anndata.experimental import (
|
11
|
+
sparse_dataset,
|
12
|
+
read_elem,
|
13
|
+
write_elem,
|
14
|
+
CSRDataset,
|
15
|
+
CSCDataset,
|
16
|
+
)
|
11
17
|
else:
|
12
|
-
from anndata.io import
|
18
|
+
from anndata.io import (
|
19
|
+
sparse_dataset,
|
20
|
+
read_elem,
|
21
|
+
write_elem,
|
22
|
+
)
|
23
|
+
from anndata.abc import (
|
24
|
+
CSRDataset,
|
25
|
+
CSCDataset,
|
26
|
+
)
|
13
27
|
|
14
28
|
from cap_anndata import CapAnnDataDF, CapAnnDataDict
|
15
29
|
|
16
30
|
logger = logging.getLogger(__name__)
|
17
31
|
|
18
32
|
X_NOTATION = Union[
|
19
|
-
h5py.Dataset,
|
33
|
+
h5py.Dataset, CSRDataset, CSCDataset, None
|
20
34
|
]
|
21
35
|
ARRAY_MAPPING_NOTATION = CapAnnDataDict[str, X_NOTATION]
|
22
|
-
|
36
|
+
FIELDS_SUPPORTED_TO_OVERWRITE = ["obs", "var", "raw.var", "uns", "layers", "obsm", "varm", "obsp", "varp"]
|
23
37
|
NotLinkedObject: Final = "__NotLinkedObject"
|
24
38
|
|
25
39
|
|
@@ -57,15 +71,7 @@ class BaseLayerMatrixAndDf:
|
|
57
71
|
return shape
|
58
72
|
|
59
73
|
def _lazy_df_load(self, key: str) -> CapAnnDataDF:
|
60
|
-
|
61
|
-
attribute = self._path_to_content + key
|
62
|
-
column_order = self._read_attr(self._file[attribute], "column-order")
|
63
|
-
df.column_order = column_order
|
64
|
-
if df.column_order.dtype != object:
|
65
|
-
# empty DataFrame will have column_order as float64
|
66
|
-
# which leads to failure in overwrite method
|
67
|
-
df.column_order = df.column_order.astype(object)
|
68
|
-
return df
|
74
|
+
return self._read_df(key=key, columns=[])
|
69
75
|
|
70
76
|
@staticmethod
|
71
77
|
def _read_attr(obj: Union[h5py.Group, h5py.Dataset], attr_name: str) -> any:
|
@@ -93,8 +99,10 @@ class BaseLayerMatrixAndDf:
|
|
93
99
|
cols_to_read = [c for c in columns if c in column_order]
|
94
100
|
df = CapAnnDataDF()
|
95
101
|
df.column_order = column_order
|
102
|
+
|
96
103
|
index_col = self._read_attr(h5_group, "_index")
|
97
|
-
|
104
|
+
index = read_elem(h5_group[index_col])
|
105
|
+
df.index = index
|
98
106
|
|
99
107
|
for col in cols_to_read:
|
100
108
|
df[col] = read_elem(h5_group[col])
|
@@ -135,15 +143,19 @@ class BaseLayerMatrixAndDf:
|
|
135
143
|
if not isinstance(group, h5py.Group):
|
136
144
|
raise ValueError(f"The object {key} must be a group!")
|
137
145
|
|
138
|
-
for
|
139
|
-
|
140
|
-
if isinstance(
|
141
|
-
cap_dict[
|
142
|
-
elif isinstance(
|
143
|
-
|
146
|
+
for entity_name in group.keys():
|
147
|
+
entity = group[entity_name]
|
148
|
+
if isinstance(entity, h5py.Dataset):
|
149
|
+
cap_dict[entity_name] = entity
|
150
|
+
elif isinstance(entity, h5py.Group):
|
151
|
+
enc_type = dict(entity.attrs).get("encoding-type")
|
152
|
+
if enc_type == "dataframe":
|
153
|
+
cap_dict[entity_name] = self._read_df(key="/".join([key, entity_name]), columns=None)
|
154
|
+
elif enc_type in ["csc_matrix", "csr_matrix"]:
|
155
|
+
cap_dict[entity_name] = sparse_dataset(entity)
|
144
156
|
else:
|
145
157
|
raise ValueError(
|
146
|
-
f"Can't link array in {key} due to unsupported type of object: {type(
|
158
|
+
f"Can't link array in {key} due to unsupported type of object: {type(entity)}"
|
147
159
|
)
|
148
160
|
|
149
161
|
def _create_new_matrix(
|
@@ -252,11 +264,11 @@ class CapAnnData(BaseLayerMatrixAndDf):
|
|
252
264
|
def raw(self) -> RawLayer:
|
253
265
|
if self._raw is None:
|
254
266
|
if "raw" not in self._file.keys():
|
255
|
-
logger.
|
267
|
+
logger.debug("Can't read raw.var since raw layer doesn't exist!")
|
256
268
|
return
|
257
269
|
|
258
270
|
if len(self._file["raw"].keys()) == 0:
|
259
|
-
logger.
|
271
|
+
logger.debug("The raw layer is empty!")
|
260
272
|
return
|
261
273
|
|
262
274
|
self._raw = RawLayer(self._file)
|
@@ -366,37 +378,43 @@ class CapAnnData(BaseLayerMatrixAndDf):
|
|
366
378
|
return list(self.obsm.keys())
|
367
379
|
|
368
380
|
def obs_keys(self) -> List[str]:
|
369
|
-
return self.obs.
|
381
|
+
return self.obs.column_order_array().tolist()
|
370
382
|
|
371
383
|
def var_keys(self) -> List[str]:
|
372
|
-
return self.var.
|
384
|
+
return self.var.column_order_array().tolist()
|
385
|
+
|
386
|
+
def field_to_entity(self, key):
|
387
|
+
if key == "obs":
|
388
|
+
return self.obs
|
389
|
+
elif key == "var":
|
390
|
+
return self.var
|
391
|
+
elif key == "raw.var":
|
392
|
+
return self.raw.var if self.raw is not None else None
|
393
|
+
elif key == "uns":
|
394
|
+
return self.uns
|
395
|
+
elif key == "layers":
|
396
|
+
return self.layers
|
397
|
+
elif key == "obsm":
|
398
|
+
return self.obsm
|
399
|
+
elif key == "varm":
|
400
|
+
return self.varm
|
401
|
+
elif key == "obsp":
|
402
|
+
return self.obsp
|
403
|
+
elif key == "varp":
|
404
|
+
return self.varp
|
405
|
+
else:
|
406
|
+
raise KeyError(
|
407
|
+
f"The field {key} is not supported! The list of supported fields are equal to {FIELDS_SUPPORTED_TO_OVERWRITE} "
|
408
|
+
f"attributes of the CapAnnData class."
|
409
|
+
)
|
373
410
|
|
374
411
|
def overwrite(self, fields: List[str] = None, compression: str = "lzf") -> None:
|
375
|
-
field_to_entity = {
|
376
|
-
"obs": self.obs,
|
377
|
-
"var": self.var,
|
378
|
-
"raw.var": self.raw.var if self.raw is not None else None,
|
379
|
-
"uns": self.uns,
|
380
|
-
"layers": self.layers,
|
381
|
-
"obsm": self.obsm,
|
382
|
-
"varm": self.varm,
|
383
|
-
"obsp": self.obsp,
|
384
|
-
"varp": self.varp,
|
385
|
-
}
|
386
|
-
|
387
412
|
if fields is None:
|
388
|
-
fields =
|
389
|
-
else:
|
390
|
-
for f in fields:
|
391
|
-
if f not in field_to_entity.keys():
|
392
|
-
raise KeyError(
|
393
|
-
f"The field {f} is not supported! The list of supported fields are equal to supported "
|
394
|
-
f"attributes of the CapAnnData class: obs, var, raw.var and uns."
|
395
|
-
)
|
413
|
+
fields = FIELDS_SUPPORTED_TO_OVERWRITE
|
396
414
|
|
397
415
|
for key in ["obs", "var", "raw.var"]:
|
398
416
|
if key in fields:
|
399
|
-
entity: CapAnnDataDF = field_to_entity
|
417
|
+
entity: CapAnnDataDF = self.field_to_entity(key)
|
400
418
|
if entity is None:
|
401
419
|
continue
|
402
420
|
|
@@ -407,11 +425,22 @@ class CapAnnData(BaseLayerMatrixAndDf):
|
|
407
425
|
f"{key}/{col}", entity[col].values, compression=compression
|
408
426
|
)
|
409
427
|
|
410
|
-
column_order = entity.
|
428
|
+
column_order = entity.column_order_array()
|
411
429
|
if (
|
412
430
|
column_order.size == 0
|
413
431
|
): # Refs https://github.com/cellannotation/cap-anndata/issues/6
|
414
432
|
column_order = np.array([], dtype=np.float64)
|
433
|
+
|
434
|
+
# Index update
|
435
|
+
index_name = entity.index.name
|
436
|
+
if not index_name:
|
437
|
+
index_name = "_index"
|
438
|
+
self._file[key].attrs["_index"] = index_name
|
439
|
+
index_col = self._read_attr(self._file[key], "_index")
|
440
|
+
self._write_elem(
|
441
|
+
f"{key}/{index_col}", entity.index.to_numpy(), compression=compression
|
442
|
+
)
|
443
|
+
|
415
444
|
self._file[key].attrs["column-order"] = column_order
|
416
445
|
|
417
446
|
if "uns" in fields:
|
@@ -424,7 +453,7 @@ class CapAnnData(BaseLayerMatrixAndDf):
|
|
424
453
|
|
425
454
|
for field in ["layers", "obsm", "varm", "obsp", "varp"]:
|
426
455
|
if field in fields:
|
427
|
-
for key in field_to_entity
|
456
|
+
for key in self.field_to_entity(field).keys_to_remove:
|
428
457
|
del self._file[f"{field}/{key}"]
|
429
458
|
|
430
459
|
def create_layer(
|
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: cap_anndata
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.5.0
|
4
4
|
Summary: Partial read/write of AnnData (h5ad) files for low-memory operations with large datasets.
|
5
5
|
Home-page: https://github.com/cellannotation/cap-anndata
|
6
6
|
Author: R. Mukhin, A. Isaev
|
@@ -14,15 +14,29 @@ Classifier: Operating System :: OS Independent
|
|
14
14
|
Requires-Python: >=3.9
|
15
15
|
Description-Content-Type: text/markdown
|
16
16
|
License-File: LICENSE
|
17
|
-
Requires-Dist: numpy
|
18
|
-
Requires-Dist: pandas
|
19
|
-
Requires-Dist: anndata
|
17
|
+
Requires-Dist: numpy>=1.23.5
|
18
|
+
Requires-Dist: pandas>=2.2.0
|
19
|
+
Requires-Dist: anndata>=0.10.0
|
20
20
|
Provides-Extra: dev
|
21
|
-
Requires-Dist: pytest
|
22
|
-
Requires-Dist: setuptools
|
21
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
22
|
+
Requires-Dist: setuptools~=69.1.1; extra == "dev"
|
23
|
+
Dynamic: author
|
24
|
+
Dynamic: author-email
|
25
|
+
Dynamic: classifier
|
26
|
+
Dynamic: description
|
27
|
+
Dynamic: description-content-type
|
28
|
+
Dynamic: home-page
|
29
|
+
Dynamic: license-file
|
30
|
+
Dynamic: project-url
|
31
|
+
Dynamic: provides-extra
|
32
|
+
Dynamic: requires-dist
|
33
|
+
Dynamic: requires-python
|
34
|
+
Dynamic: summary
|
23
35
|
|
24
36
|
# CAP-AnnData: Partial I/O for AnnData (.h5ad) Files
|
25
37
|
|
38
|
+
[](https://pypi.org/project/cap-anndata/) [](https://github.com/cellannotation/cap-anndata/actions)
|
39
|
+
|
26
40
|
## Overview
|
27
41
|
CAP-AnnData offering functionalities for selective reading and writing of [AnnData](https://pypi.org/project/anndata/)
|
28
42
|
file fields without the need for loading entire dataset (or even entire field) into memory.
|
@@ -0,0 +1,10 @@
|
|
1
|
+
cap_anndata/__init__.py,sha256=WRAQEDsWTvLbJWVUA5FmKCVrD2GN4oRd5I3c8jc9ajo,197
|
2
|
+
cap_anndata/backed_df.py,sha256=2OVomvTY51V05sYwEXg-4JYBgd9iJCA2-Lt7nEAL1Ug,3255
|
3
|
+
cap_anndata/backed_dict.py,sha256=Hb1SjnKuQ13mBUitQ5sL3kmcQ1j3GgB19r3yXkC0oIo,1019
|
4
|
+
cap_anndata/cap_anndata.py,sha256=4sro4BIsaOuTBHrRXYCi0WlGtxsql_bnqIDEpT2tRhQ,21371
|
5
|
+
cap_anndata/reader.py,sha256=UpZBCjaS4-K2w_9m6IuYetO9LwmEEJ5KvAw9aAoMRno,1609
|
6
|
+
cap_anndata-0.5.0.dist-info/licenses/LICENSE,sha256=XXTH6JikkxH7Gqy9VEj4crSizuwxzv04ROzkQ-ZS6o4,1532
|
7
|
+
cap_anndata-0.5.0.dist-info/METADATA,sha256=CbdJemeEOB1hIJ7tPrVOT7JldkVNOiJ6zkW8AFqTjqU,2825
|
8
|
+
cap_anndata-0.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
9
|
+
cap_anndata-0.5.0.dist-info/top_level.txt,sha256=GKi_Uk4LUhXwWBfFCTIyJvEoJqFREt_4uH4CWgeLsg4,12
|
10
|
+
cap_anndata-0.5.0.dist-info/RECORD,,
|
@@ -1,10 +0,0 @@
|
|
1
|
-
cap_anndata/__init__.py,sha256=WRAQEDsWTvLbJWVUA5FmKCVrD2GN4oRd5I3c8jc9ajo,197
|
2
|
-
cap_anndata/backed_df.py,sha256=bMNsArbPjA-TN7eQB4-9Y2l3s8o03-dM4hPnOR9tROc,2622
|
3
|
-
cap_anndata/backed_dict.py,sha256=Hb1SjnKuQ13mBUitQ5sL3kmcQ1j3GgB19r3yXkC0oIo,1019
|
4
|
-
cap_anndata/cap_anndata.py,sha256=uQh49Kwu2cE4-ebgOvb78mMGA_afkZcsr71j6f8EX2I,20600
|
5
|
-
cap_anndata/reader.py,sha256=UpZBCjaS4-K2w_9m6IuYetO9LwmEEJ5KvAw9aAoMRno,1609
|
6
|
-
cap_anndata-0.3.1.dist-info/LICENSE,sha256=XXTH6JikkxH7Gqy9VEj4crSizuwxzv04ROzkQ-ZS6o4,1532
|
7
|
-
cap_anndata-0.3.1.dist-info/METADATA,sha256=688YuF45IuOvu1Hqxbt_O1aeYkoMX4tjV0b2hb1WY8I,2304
|
8
|
-
cap_anndata-0.3.1.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
|
9
|
-
cap_anndata-0.3.1.dist-info/top_level.txt,sha256=GKi_Uk4LUhXwWBfFCTIyJvEoJqFREt_4uH4CWgeLsg4,12
|
10
|
-
cap_anndata-0.3.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|