cap-anndata 0.1.0__tar.gz
- cap_anndata-0.1.0/LICENSE +28 -0
- cap_anndata-0.1.0/PKG-INFO +199 -0
- cap_anndata-0.1.0/README.md +179 -0
- cap_anndata-0.1.0/cap_anndata/__init__.py +6 -0
- cap_anndata-0.1.0/cap_anndata/backed_df.py +40 -0
- cap_anndata-0.1.0/cap_anndata/backed_uns.py +28 -0
- cap_anndata-0.1.0/cap_anndata/cap_anndata.py +221 -0
- cap_anndata-0.1.0/cap_anndata.egg-info/PKG-INFO +199 -0
- cap_anndata-0.1.0/cap_anndata.egg-info/SOURCES.txt +15 -0
- cap_anndata-0.1.0/cap_anndata.egg-info/dependency_links.txt +1 -0
- cap_anndata-0.1.0/cap_anndata.egg-info/requires.txt +6 -0
- cap_anndata-0.1.0/cap_anndata.egg-info/top_level.txt +1 -0
- cap_anndata-0.1.0/setup.cfg +4 -0
- cap_anndata-0.1.0/setup.py +28 -0
- cap_anndata-0.1.0/test/test_backed_df.py +58 -0
- cap_anndata-0.1.0/test/test_backed_uns.py +36 -0
- cap_anndata-0.1.0/test/test_cap_anndata.py +291 -0

cap_anndata-0.1.0/LICENSE
@@ -0,0 +1,28 @@
BSD 3-Clause License

Copyright (c) 2024, R. Mukhin, A. Isaev, Cell-Annotation Platform

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

cap_anndata-0.1.0/PKG-INFO
@@ -0,0 +1,199 @@
Metadata-Version: 2.1
Name: cap_anndata
Version: 0.1.0
Summary: Partial read of AnnData files for low-memory operations with large datasets.
Home-page: https://github.com/cellannotation/cap-anndata
Author: R. Mukhin, A. Isaev
Author-email: roman@ebookapplications.com
Project-URL: Bug Tracker, https://github.com/cellannotation/cap-anndata/issues
Classifier: Programming Language :: Python :: 3.9
Classifier: License :: OSI Approved :: BSD License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: numpy>=1.26.3
Requires-Dist: pandas>=2.2.0
Requires-Dist: anndata>=0.10.5
Provides-Extra: dev
Requires-Dist: pytest>=8.0.0; extra == "dev"

# CAP-AnnData: Enhanced Partial I/O for AnnData Files

## Overview
CAP-AnnData enriches the AnnData ecosystem by offering tailored functionality for partial reading and writing of AnnData files. It allows selective manipulation of sections such as `obs`, `var`, `X`, `raw.X`, `obsm`, and `uns` without loading the entire dataset into memory. By leveraging AnnData's native methods, CAP-AnnData aims to maintain backward compatibility while improving efficiency, which is especially useful for large-scale single-cell genomics data.

## Getting Started

### Running Tests
Ensure the integrity and reliability of CAP-AnnData on your system by running the unit tests in the `test/` directory.

Make sure Python 3.9 or newer is used, along with all requirements specified in `requirements.txt`.

## How-To

#### 1. Read AnnData File Dataframes

##### Basic Reading
By default, `CapAnnData` does not automatically read any data. To begin working with dataframes, you need to explicitly read the data from the AnnData file. You can read the entire dataframe or select specific columns. For partial reading, provide a list of column names.

```python
import h5py
from cap_anndata import CapAnnData

file_path = "your_data.h5ad"
with h5py.File(file_path, 'r') as file:
    cap_adata = CapAnnData(file)

    # Read all columns of 'obs'
    cap_adata.read_obs()

    # Read specific columns of 'var'
    cap_adata.read_var(columns=['gene_expression', 'dispersion'])

    # Read all columns of 'raw.var'
    cap_adata.read_var(raw=True)
```

##### Non-existing columns

If a column doesn't exist in the file, no error is raised; the column is simply missing from the resulting dataframe. The list of columns therefore means "try to read these columns from the file". This behavior is needed because there is currently no way to check whether a column exists before reading it.
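
A minimal sketch of this behavior (assuming the file's `obs` has a `cell_type` column but no `batch` column; both names are only illustrative):

```python
# 'batch' is absent from the file, so only 'cell_type' is loaded;
# no exception is raised and the resulting dataframe simply lacks 'batch'
cap_adata.read_obs(columns=['cell_type', 'batch'])
assert 'batch' not in cap_adata.obs.columns
```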

#### 2. Modify the AnnData File Dataframes In-Place

You can directly modify the dataframe by adding, renaming, or removing columns.

```python
# Create a new column
cap_adata.obs['new_col'] = [value1, value2, value3]

# Rename a column
cap_adata.obs.rename_column('old_col_name', 'new_col_name')

# Remove a column
cap_adata.obs.remove_column('col_to_remove')
```

After modifications, you can overwrite the changes back to the AnnData file. If a value doesn't exist, it will be created.

```python
# overwrite all values which were read
cap_adata.overwrite()

# overwrite chosen fields only
cap_adata.overwrite(['obs', 'var'])
```

The full list of supported fields: `X`, `raw.X`, `obs`, `var`, `raw.var`, `obsm`, `uns`.

#### 3. How to Read a Few Columns but Overwrite One in a Dataframe

For now, the only way to do this is to drop the unneeded columns from the in-memory dataframe (with `pandas.DataFrame.drop`, not `remove_column`!) before calling the `overwrite` method.

```python
# Read specific columns
cap_adata.read_obs(columns=['cell_type', 'sample'])

# Drop a column in-memory
# DON'T USE remove_column here!
cap_adata.obs.drop(columns='sample', inplace=True)

# Overwrite changes
cap_adata.overwrite(['obs'])
```

#### 4. How to work with X and raw.X

The CapAnnData package doesn't read any field by default. However, `X` and `raw.X` are linked to the backed matrices automatically upon the first access to those fields.

```python
with h5py.File(path) as file:
    # nothing is read or linked yet
    cap_adata = CapAnnData(file)

    # will return the h5py.Dataset or CSRDataset
    x = cap_adata.X

    # the same for raw.X
    raw_x = cap_adata.raw.X

    # take the whole matrix into memory
    x = cap_adata.X[:]
```

CapAnnData supports the standard `numpy`/`h5py` slicing rules:

```python
import numpy as np

# slice rows
s_ = np.s_[0:5]
# slice columns
s_ = np.s_[:, 0:5]
# boolean mask + slicing
mask = np.array([i < 5 for i in range(cap_adata.shape[0])])
s_ = np.s_[mask, :5]
```
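
Any of these selections can then be applied to the backed matrices so that only the selected part is loaded into memory (a short sketch reusing the `cap_adata` and `s_` objects defined above):

```python
# only the selected rows/columns are read from disk
x_subset = cap_adata.X[s_]
raw_x_subset = cap_adata.raw.X[s_]
```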

#### 5. How to handle obsm embedding matrices

By default, CapAnnData will not read the embedding matrices. Links to the underlying h5py objects are created upon the first call of the `.obsm` property. As in the AnnData package, a call like `cap_adata.obsm["X_tsne"]` will not return an in-memory matrix but the backed version instead. This makes it possible to get the names and shapes of the embeddings without loading the whole matrices into memory!

```python
with h5py.File(path) as file:
    # initialization
    cap_adata = CapAnnData(file)

    # will return a list of strings
    obsm_keys = cap_adata.obsm_keys()

    # return the shape of the matrix in backed mode
    embeddings = obsm_keys[0]
    shape = cap_adata.obsm[embeddings].shape

    # take the whole matrix into memory
    matrix = cap_adata.obsm[embeddings][:]
```

#### 6. How to read and modify the uns section

The `CapAnnData` class lazily links the `uns` section upon first access but ***WILL NOT*** read it into memory. Instead, a dictionary of pairs `{'key': "__NotLinkedObject"}` is created. This allows getting the list of keys before the actual read. To read the `uns` section into memory, the `.read_uns(keys)` method must be called.

```python
with h5py.File(path) as file:
    # initialization
    cap_adata = CapAnnData(file)

    # list of keys, without reading the values
    keys = list(cap_adata.uns.keys())

    # read the first key only into memory
    cap_adata.read_uns([keys[0]])

    # read the whole uns section into memory
    cap_adata.read_uns()
```

Once the `.uns` section is in memory (partially or completely), we can work with it like a regular Python `dict` object. The main feature of the `CapAnnDataUns` class, which inherits from `dict`, is the tracking of keys that must be removed from the `.h5ad` file upon overwrite.

```python
# get a value
v = cap_adata.uns["key1"]
v = cap_adata.uns.get("key1")

# modify values
cap_adata.uns["key1"] = "new_value"

# create new keys
cap_adata.uns["new_key"] = "value"

# remove keys
cap_adata.uns.pop("key1")  # recommended way
del cap_adata.uns["key2"]
cap_adata.uns.popitem()
```

To save `uns` changes, the `CapAnnData.overwrite()` method must be called.

```python
cap_adata.overwrite()  # all in-memory fields will be overwritten
cap_adata.overwrite(["uns"])  # overwrite the uns section only
```

cap_anndata-0.1.0/README.md
@@ -0,0 +1,179 @@
# CAP-AnnData: Enhanced Partial I/O for AnnData Files

## Overview
CAP-AnnData enriches the AnnData ecosystem by offering tailored functionality for partial reading and writing of AnnData files. It allows selective manipulation of sections such as `obs`, `var`, `X`, `raw.X`, `obsm`, and `uns` without loading the entire dataset into memory. By leveraging AnnData's native methods, CAP-AnnData aims to maintain backward compatibility while improving efficiency, which is especially useful for large-scale single-cell genomics data.

## Getting Started

### Running Tests
Ensure the integrity and reliability of CAP-AnnData on your system by running the unit tests in the `test/` directory.

Make sure Python 3.9 or newer is used, along with all requirements specified in `requirements.txt`.

## How-To

#### 1. Read AnnData File Dataframes

##### Basic Reading
By default, `CapAnnData` does not automatically read any data. To begin working with dataframes, you need to explicitly read the data from the AnnData file. You can read the entire dataframe or select specific columns. For partial reading, provide a list of column names.

```python
import h5py
from cap_anndata import CapAnnData

file_path = "your_data.h5ad"
with h5py.File(file_path, 'r') as file:
    cap_adata = CapAnnData(file)

    # Read all columns of 'obs'
    cap_adata.read_obs()

    # Read specific columns of 'var'
    cap_adata.read_var(columns=['gene_expression', 'dispersion'])

    # Read all columns of 'raw.var'
    cap_adata.read_var(raw=True)
```

##### Non-existing columns

If a column doesn't exist in the file, no error is raised; the column is simply missing from the resulting dataframe. The list of columns therefore means "try to read these columns from the file". This behavior is needed because there is currently no way to check whether a column exists before reading it.
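
A minimal sketch of this behavior (assuming the file's `obs` has a `cell_type` column but no `batch` column; both names are only illustrative):

```python
# 'batch' is absent from the file, so only 'cell_type' is loaded;
# no exception is raised and the resulting dataframe simply lacks 'batch'
cap_adata.read_obs(columns=['cell_type', 'batch'])
assert 'batch' not in cap_adata.obs.columns
```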

#### 2. Modify the AnnData File Dataframes In-Place

You can directly modify the dataframe by adding, renaming, or removing columns.

```python
# Create a new column
cap_adata.obs['new_col'] = [value1, value2, value3]

# Rename a column
cap_adata.obs.rename_column('old_col_name', 'new_col_name')

# Remove a column
cap_adata.obs.remove_column('col_to_remove')
```

After modifications, you can overwrite the changes back to the AnnData file. If a value doesn't exist, it will be created.

```python
# overwrite all values which were read
cap_adata.overwrite()

# overwrite chosen fields only
cap_adata.overwrite(['obs', 'var'])
```

The full list of supported fields: `X`, `raw.X`, `obs`, `var`, `raw.var`, `obsm`, `uns`.

#### 3. How to Read a Few Columns but Overwrite One in a Dataframe

For now, the only way to do this is to drop the unneeded columns from the in-memory dataframe (with `pandas.DataFrame.drop`, not `remove_column`!) before calling the `overwrite` method.

```python
# Read specific columns
cap_adata.read_obs(columns=['cell_type', 'sample'])

# Drop a column in-memory
# DON'T USE remove_column here!
cap_adata.obs.drop(columns='sample', inplace=True)

# Overwrite changes
cap_adata.overwrite(['obs'])
```

#### 4. How to work with X and raw.X

The CapAnnData package doesn't read any field by default. However, `X` and `raw.X` are linked to the backed matrices automatically upon the first access to those fields.

```python
with h5py.File(path) as file:
    # nothing is read or linked yet
    cap_adata = CapAnnData(file)

    # will return the h5py.Dataset or CSRDataset
    x = cap_adata.X

    # the same for raw.X
    raw_x = cap_adata.raw.X

    # take the whole matrix into memory
    x = cap_adata.X[:]
```

CapAnnData supports the standard `numpy`/`h5py` slicing rules:

```python
import numpy as np

# slice rows
s_ = np.s_[0:5]
# slice columns
s_ = np.s_[:, 0:5]
# boolean mask + slicing
mask = np.array([i < 5 for i in range(cap_adata.shape[0])])
s_ = np.s_[mask, :5]
```
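
Any of these selections can then be applied to the backed matrices so that only the selected part is loaded into memory (a short sketch reusing the `cap_adata` and `s_` objects defined above):

```python
# only the selected rows/columns are read from disk
x_subset = cap_adata.X[s_]
raw_x_subset = cap_adata.raw.X[s_]
```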

#### 5. How to handle obsm embedding matrices

By default, CapAnnData will not read the embedding matrices. Links to the underlying h5py objects are created upon the first call of the `.obsm` property. As in the AnnData package, a call like `cap_adata.obsm["X_tsne"]` will not return an in-memory matrix but the backed version instead. This makes it possible to get the names and shapes of the embeddings without loading the whole matrices into memory!

```python
with h5py.File(path) as file:
    # initialization
    cap_adata = CapAnnData(file)

    # will return a list of strings
    obsm_keys = cap_adata.obsm_keys()

    # return the shape of the matrix in backed mode
    embeddings = obsm_keys[0]
    shape = cap_adata.obsm[embeddings].shape

    # take the whole matrix into memory
    matrix = cap_adata.obsm[embeddings][:]
```

#### 6. How to read and modify the uns section

The `CapAnnData` class lazily links the `uns` section upon first access but ***WILL NOT*** read it into memory. Instead, a dictionary of pairs `{'key': "__NotLinkedObject"}` is created. This allows getting the list of keys before the actual read. To read the `uns` section into memory, the `.read_uns(keys)` method must be called.

```python
with h5py.File(path) as file:
    # initialization
    cap_adata = CapAnnData(file)

    # list of keys, without reading the values
    keys = list(cap_adata.uns.keys())

    # read the first key only into memory
    cap_adata.read_uns([keys[0]])

    # read the whole uns section into memory
    cap_adata.read_uns()
```

Once the `.uns` section is in memory (partially or completely), we can work with it like a regular Python `dict` object. The main feature of the `CapAnnDataUns` class, which inherits from `dict`, is the tracking of keys that must be removed from the `.h5ad` file upon overwrite.

```python
# get a value
v = cap_adata.uns["key1"]
v = cap_adata.uns.get("key1")

# modify values
cap_adata.uns["key1"] = "new_value"

# create new keys
cap_adata.uns["new_key"] = "value"

# remove keys
cap_adata.uns.pop("key1")  # recommended way
del cap_adata.uns["key2"]
cap_adata.uns.popitem()
```

To save `uns` changes, the `CapAnnData.overwrite()` method must be called.

```python
cap_adata.overwrite()  # all in-memory fields will be overwritten
cap_adata.overwrite(["uns"])  # overwrite the uns section only
```

cap_anndata-0.1.0/cap_anndata/backed_df.py
@@ -0,0 +1,40 @@
import pandas as pd
import numpy as np
from typing import List
import logging

logger = logging.getLogger(__name__)


class CapAnnDataDF(pd.DataFrame):
    """
    Expands the pandas DataFrame behaviour to support partial
    reading and writing of the AnnData obs and var (raw.var) fields.
    The main feature of the class is the handling of the <column-order>
    attribute, which must be a copy of the corresponding h5py.Group attribute.
    """
    _metadata = ['column_order']

    def rename_column(self, old_name: str, new_name: str) -> None:
        i = np.where(self.column_order == old_name)[0]
        self.column_order[i] = new_name
        self.rename(columns={old_name: new_name}, inplace=True)

    def remove_column(self, col_name: str) -> None:
        i = np.where(self.column_order == col_name)[0]
        self.column_order = np.delete(self.column_order, i)
        self.drop(columns=[col_name], inplace=True)

    def __setitem__(self, key, value) -> None:
        if key not in self.column_order:
            self.column_order = np.append(self.column_order, key)
        return super().__setitem__(key, value)

    @classmethod
    def from_df(cls, df: pd.DataFrame, column_order: List[str] = None):
        if column_order is None:
            column_order = df.columns.to_numpy()

        new_inst = cls(df)
        new_inst.column_order = column_order
        return new_inst

cap_anndata-0.1.0/cap_anndata/backed_uns.py
@@ -0,0 +1,28 @@
from typing import List, Any


class CapAnnDataUns(dict):
    __keys_to_remove: List[str] = []

    def __delitem__(self, __key: Any) -> None:
        self.__keys_to_remove.append(__key)
        return super().__delitem__(__key)

    def __setitem__(self, __key: Any, __value: Any) -> None:
        if __key in self.__keys_to_remove:
            self.__keys_to_remove.remove(__key)
        return super().__setitem__(__key, __value)

    @property
    def keys_to_remove(self):
        return self.__keys_to_remove

    def pop(self, __key: Any, __default: Any = None) -> Any:
        if __key in self:
            self.__keys_to_remove.append(__key)
        return super().pop(__key, __default)

    def popitem(self) -> Any:
        item = super().popitem()
        self.__keys_to_remove.append(item[0])
        return item

cap_anndata-0.1.0/cap_anndata/cap_anndata.py
@@ -0,0 +1,221 @@
import logging
import contextlib
import anndata as ad
import h5py
from typing import List, Union, Dict, Tuple, Final, Any
from anndata._io.specs import read_elem, write_elem
from dataclasses import dataclass

from cap_anndata import CapAnnDataDF, CapAnnDataUns

logger = logging.getLogger(__name__)

X_NOTATION = Union[h5py.Dataset, ad.experimental.CSRDataset, ad.experimental.CSCDataset]
OBSM_NOTATION = Dict[str, X_NOTATION]

NotLinkedObject: Final = "__NotLinkedObject"


@dataclass
class RawLayer:
    var: CapAnnDataDF = None
    X: X_NOTATION = None

    @property
    def shape(self) -> Tuple[int, int]:
        return self.X.shape if self.X is not None else None


class CapAnnData:
    def __init__(self, h5_file: h5py.File) -> None:
        self._file: h5py.File = h5_file
        self.obs: CapAnnDataDF = None
        self.var: CapAnnDataDF = None
        self._X: X_NOTATION = None
        self._obsm: OBSM_NOTATION = None
        self._uns: CapAnnDataUns = None
        self._raw: RawLayer = None
        self._shape: Tuple[int, int] = None

    @property
    def X(self) -> X_NOTATION:
        if self._X is None:
            self._link_x()
        return self._X

    @property
    def obsm(self) -> OBSM_NOTATION:
        if self._obsm is None:
            self._link_obsm()
        return self._obsm

    @property
    def raw(self) -> RawLayer:
        if self._raw is None:
            self._link_raw_x()
        return self._raw

    @property
    def uns(self) -> CapAnnDataUns:
        if self._uns is None:
            self._uns = CapAnnDataUns({k: NotLinkedObject for k in self._file["uns"].keys()})
        return self._uns

    def read_obs(self, columns: List[str] = None) -> None:
        self.obs = self._read_df(self._file["obs"], columns=columns)

    def read_var(self, columns: List[str] = None, raw: bool = False) -> None:
        if raw:
            # Check if raw exists first
            if "raw" not in self._file.keys():
                logger.debug("Can't read raw.var since raw layer doesn't exist!")
                return

            if self._raw is None:
                self._raw = RawLayer()
                self._link_raw_x()

            key = "raw/var"
            self._raw.var = self._read_df(self._file[key], columns=columns)
        else:
            key = "var"
            self.var = self._read_df(self._file[key], columns=columns)

    def _read_df(self, h5_group: h5py.Group, columns: List[str]) -> CapAnnDataDF:
        column_order = self._read_attr(h5_group, "column-order")

        if columns is None:
            # read whole df
            df = CapAnnDataDF.from_df(read_elem(h5_group), column_order=column_order)
        else:
            cols_to_read = [c for c in columns if c in column_order]
            df = CapAnnDataDF()
            df.column_order = column_order

            index_col = self._read_attr(h5_group, "_index")
            df.index = read_elem(h5_group[index_col])

            for col in cols_to_read:
                df[col] = read_elem(h5_group[col])
        return df

    @staticmethod
    def _read_attr(obj: Union[h5py.Group, h5py.Dataset], attr_name: str) -> Any:
        attrs = dict(obj.attrs)
        if attr_name not in attrs.keys():
            raise KeyError(f"The attribute {attr_name} doesn't exist!")
        return attrs[attr_name]

    def overwrite(self, fields: List[str] = None) -> None:
        field_to_entity = {
            "obs": self.obs,
            "var": self.var,
            "raw.var": self.raw.var if self.raw is not None else None,
            "uns": self.uns
        }

        if fields is None:
            fields = list(field_to_entity.keys())
        else:
            for f in fields:
                if f not in field_to_entity.keys():
                    raise KeyError(
                        f"The field {f} is not supported! The list of supported fields is equal "
                        f"to the supported attributes of the CapAnnData class: obs, var, raw.var and uns."
                    )

        for key in ["obs", "var", "raw.var"]:
            if key in fields:
                entity: CapAnnDataDF = field_to_entity[key]
                if entity is None:
                    continue

                key = key.replace(".", '/') if key == "raw.var" else key

                for col in entity.columns:
                    self._write_elem_lzf(f"{key}/{col}", entity[col].values)
                self._file[key].attrs['column-order'] = entity.column_order

        if "uns" in fields:
            for key in self.uns.keys():
                if self.uns[key] is not NotLinkedObject:
                    dest = f"uns/{key}"
                    self._write_elem_lzf(dest, self.uns[key])
            for key in self.uns.keys_to_remove:
                del self._file[f"uns/{key}"]

    def read_uns(self, keys: List[str] = None) -> None:
        if keys is None:
            keys = list(self.uns.keys())

        for key in keys:
            existing_keys = self.uns.keys()
            if key in existing_keys:
                source = self._file[f"uns/{key}"]
                self.uns[key] = read_elem(source)

    @property
    def shape(self) -> Tuple[int, int]:
        return self.X.shape

    def _link_x(self) -> None:
        x = self._file["X"]
        if isinstance(x, h5py.Dataset):
            # dense X
            self._X = x
        else:
            # sparse dataset
            self._X = ad.experimental.sparse_dataset(x)

    def _link_raw_x(self) -> None:
        if "raw" in self._file.keys():
            if self._raw is None:
                self._raw = RawLayer()

            raw_x = self._file["raw/X"]
            if isinstance(raw_x, h5py.Dataset):
                # dense X
                self._raw.X = raw_x
            else:
                # sparse dataset
                self._raw.X = ad.experimental.sparse_dataset(raw_x)

    def _link_obsm(self) -> None:
        self._obsm = {}
        if "obsm" in self._file.keys():
            obsm_group = self._file["obsm"]
            for entity_name in obsm_group.keys():
                entity = obsm_group[entity_name]
                if isinstance(entity, h5py.Dataset):
                    # dense array
                    self._obsm[entity_name] = entity
                else:
                    # sparse array
                    self._obsm[entity_name] = ad.experimental.sparse_dataset(entity)
        logger.debug(f"obsm={self._obsm}")

    def obsm_keys(self) -> List[str]:
        return list(self.obsm.keys())

    def _write_elem_lzf(self, dest_key: str, elem: Any) -> None:
        write_elem(self._file, dest_key, elem, dataset_kwargs={"compression": "lzf"})

    @staticmethod
    @contextlib.contextmanager
    def read_anndata_file(file_path, backed='r'):
        """The method to read an anndata file using the original AnnData package"""
        logger.debug(f"Read file {file_path} in backed mode = {backed}...")

        adata = None
        try:
            adata = ad.read_h5ad(file_path, backed=backed)
            logger.debug(f"Successfully read anndata file path {file_path}")
            yield adata

        except Exception as error:
            logger.error(f"Error during read anndata file at path: {file_path}, error = {error}!")
            raise error

        finally:
            if adata is not None:
                if adata.isbacked:
                    adata.file.close()
                    logger.debug("AnnData closed!")

cap_anndata-0.1.0/cap_anndata.egg-info/PKG-INFO
@@ -0,0 +1,199 @@
Metadata-Version: 2.1
Name: cap_anndata
Version: 0.1.0
Summary: Partial read of AnnData files for low-memory operations with large datasets.
Home-page: https://github.com/cellannotation/cap-anndata
Author: R. Mukhin, A. Isaev
Author-email: roman@ebookapplications.com
Project-URL: Bug Tracker, https://github.com/cellannotation/cap-anndata/issues
Classifier: Programming Language :: Python :: 3.9
Classifier: License :: OSI Approved :: BSD License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: numpy>=1.26.3
Requires-Dist: pandas>=2.2.0
Requires-Dist: anndata>=0.10.5
Provides-Extra: dev
Requires-Dist: pytest>=8.0.0; extra == "dev"

# CAP-AnnData: Enhanced Partial I/O for AnnData Files

## Overview
CAP-AnnData enriches the AnnData ecosystem by offering tailored functionality for partial reading and writing of AnnData files. It allows selective manipulation of sections such as `obs`, `var`, `X`, `raw.X`, `obsm`, and `uns` without loading the entire dataset into memory. By leveraging AnnData's native methods, CAP-AnnData aims to maintain backward compatibility while improving efficiency, which is especially useful for large-scale single-cell genomics data.

## Getting Started

### Running Tests
Ensure the integrity and reliability of CAP-AnnData on your system by running the unit tests in the `test/` directory.

Make sure Python 3.9 or newer is used, along with all requirements specified in `requirements.txt`.

## How-To

#### 1. Read AnnData File Dataframes

##### Basic Reading
By default, `CapAnnData` does not automatically read any data. To begin working with dataframes, you need to explicitly read the data from the AnnData file. You can read the entire dataframe or select specific columns. For partial reading, provide a list of column names.

```python
import h5py
from cap_anndata import CapAnnData

file_path = "your_data.h5ad"
with h5py.File(file_path, 'r') as file:
    cap_adata = CapAnnData(file)

    # Read all columns of 'obs'
    cap_adata.read_obs()

    # Read specific columns of 'var'
    cap_adata.read_var(columns=['gene_expression', 'dispersion'])

    # Read all columns of 'raw.var'
    cap_adata.read_var(raw=True)
```

##### Non-existing columns

If a column doesn't exist in the file, no error is raised; the column is simply missing from the resulting dataframe. The list of columns therefore means "try to read these columns from the file". This behavior is needed because there is currently no way to check whether a column exists before reading it.
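
A minimal sketch of this behavior (assuming the file's `obs` has a `cell_type` column but no `batch` column; both names are only illustrative):

```python
# 'batch' is absent from the file, so only 'cell_type' is loaded;
# no exception is raised and the resulting dataframe simply lacks 'batch'
cap_adata.read_obs(columns=['cell_type', 'batch'])
assert 'batch' not in cap_adata.obs.columns
```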

#### 2. Modify the AnnData File Dataframes In-Place

You can directly modify the dataframe by adding, renaming, or removing columns.

```python
# Create a new column
cap_adata.obs['new_col'] = [value1, value2, value3]

# Rename a column
cap_adata.obs.rename_column('old_col_name', 'new_col_name')

# Remove a column
cap_adata.obs.remove_column('col_to_remove')
```

After modifications, you can overwrite the changes back to the AnnData file. If a value doesn't exist, it will be created.

```python
# overwrite all values which were read
cap_adata.overwrite()

# overwrite chosen fields only
cap_adata.overwrite(['obs', 'var'])
```

The full list of supported fields: `X`, `raw.X`, `obs`, `var`, `raw.var`, `obsm`, `uns`.

#### 3. How to Read a Few Columns but Overwrite One in a Dataframe

For now, the only way to do this is to drop the unneeded columns from the in-memory dataframe (with `pandas.DataFrame.drop`, not `remove_column`!) before calling the `overwrite` method.

```python
# Read specific columns
cap_adata.read_obs(columns=['cell_type', 'sample'])

# Drop a column in-memory
# DON'T USE remove_column here!
cap_adata.obs.drop(columns='sample', inplace=True)

# Overwrite changes
cap_adata.overwrite(['obs'])
```

#### 4. How to work with X and raw.X

The CapAnnData package doesn't read any field by default. However, `X` and `raw.X` are linked to the backed matrices automatically upon the first access to those fields.

```python
with h5py.File(path) as file:
    # nothing is read or linked yet
    cap_adata = CapAnnData(file)

    # will return the h5py.Dataset or CSRDataset
    x = cap_adata.X

    # the same for raw.X
    raw_x = cap_adata.raw.X

    # take the whole matrix into memory
    x = cap_adata.X[:]
```

CapAnnData supports the standard `numpy`/`h5py` slicing rules:

```python
import numpy as np

# slice rows
s_ = np.s_[0:5]
# slice columns
s_ = np.s_[:, 0:5]
# boolean mask + slicing
mask = np.array([i < 5 for i in range(cap_adata.shape[0])])
s_ = np.s_[mask, :5]
```
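
Any of these selections can then be applied to the backed matrices so that only the selected part is loaded into memory (a short sketch reusing the `cap_adata` and `s_` objects defined above):

```python
# only the selected rows/columns are read from disk
x_subset = cap_adata.X[s_]
raw_x_subset = cap_adata.raw.X[s_]
```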

#### 5. How to handle obsm embedding matrices

By default, CapAnnData will not read the embedding matrices. Links to the underlying h5py objects are created upon the first call of the `.obsm` property. As in the AnnData package, a call like `cap_adata.obsm["X_tsne"]` will not return an in-memory matrix but the backed version instead. This makes it possible to get the names and shapes of the embeddings without loading the whole matrices into memory!

```python
with h5py.File(path) as file:
    # initialization
    cap_adata = CapAnnData(file)

    # will return a list of strings
    obsm_keys = cap_adata.obsm_keys()

    # return the shape of the matrix in backed mode
    embeddings = obsm_keys[0]
    shape = cap_adata.obsm[embeddings].shape

    # take the whole matrix into memory
    matrix = cap_adata.obsm[embeddings][:]
```

#### 6. How to read and modify the uns section

The `CapAnnData` class lazily links the `uns` section upon first access but ***WILL NOT*** read it into memory. Instead, a dictionary of pairs `{'key': "__NotLinkedObject"}` is created. This allows getting the list of keys before the actual read. To read the `uns` section into memory, the `.read_uns(keys)` method must be called.

```python
with h5py.File(path) as file:
    # initialization
    cap_adata = CapAnnData(file)

    # list of keys, without reading the values
    keys = list(cap_adata.uns.keys())

    # read the first key only into memory
    cap_adata.read_uns([keys[0]])

    # read the whole uns section into memory
    cap_adata.read_uns()
```

Once the `.uns` section is in memory (partially or completely), we can work with it like a regular Python `dict` object. The main feature of the `CapAnnDataUns` class, which inherits from `dict`, is the tracking of keys that must be removed from the `.h5ad` file upon overwrite.

```python
# get a value
v = cap_adata.uns["key1"]
v = cap_adata.uns.get("key1")

# modify values
cap_adata.uns["key1"] = "new_value"

# create new keys
cap_adata.uns["new_key"] = "value"

# remove keys
cap_adata.uns.pop("key1")  # recommended way
del cap_adata.uns["key2"]
cap_adata.uns.popitem()
```

To save `uns` changes, the `CapAnnData.overwrite()` method must be called.

```python
cap_adata.overwrite()  # all in-memory fields will be overwritten
cap_adata.overwrite(["uns"])  # overwrite the uns section only
```

cap_anndata-0.1.0/cap_anndata.egg-info/SOURCES.txt
@@ -0,0 +1,15 @@
LICENSE
README.md
setup.py
cap_anndata/__init__.py
cap_anndata/backed_df.py
cap_anndata/backed_uns.py
cap_anndata/cap_anndata.py
cap_anndata.egg-info/PKG-INFO
cap_anndata.egg-info/SOURCES.txt
cap_anndata.egg-info/dependency_links.txt
cap_anndata.egg-info/requires.txt
cap_anndata.egg-info/top_level.txt
test/test_backed_df.py
test/test_backed_uns.py
test/test_cap_anndata.py

cap_anndata-0.1.0/cap_anndata.egg-info/dependency_links.txt
@@ -0,0 +1 @@


cap_anndata-0.1.0/cap_anndata.egg-info/top_level.txt
@@ -0,0 +1 @@
cap_anndata

cap_anndata-0.1.0/setup.py
@@ -0,0 +1,28 @@
from setuptools import setup, find_packages

setup(
    name='cap_anndata',
    version='0.1.0',
    author='R. Mukhin, A. Isaev',
    author_email='roman@ebookapplications.com',
    packages=find_packages(exclude=["test"]),
    description='Partial read of AnnData files for low-memory operations with large datasets.',
    long_description=open('README.md').read(),
    long_description_content_type='text/markdown',
    url='https://github.com/cellannotation/cap-anndata',
    project_urls={
        "Bug Tracker": "https://github.com/cellannotation/cap-anndata/issues"
    },
    classifiers=[
        "Programming Language :: Python :: 3.9",
        "License :: OSI Approved :: BSD License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.9',
    install_requires=[
        "numpy>=1.26.3",
        "pandas>=2.2.0",
        "anndata>=0.10.5"
    ],
    extras_require={"dev": ["pytest>=8.0.0"]}
)

cap_anndata-0.1.0/test/test_backed_df.py
@@ -0,0 +1,58 @@
import pandas as pd
import numpy as np
from cap_anndata import CapAnnDataDF


def test_from_df():
    data = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    cap_ann_data_df = CapAnnDataDF.from_df(data)

    assert np.allclose(data.values, cap_ann_data_df.values)
    assert all(data.columns == cap_ann_data_df.column_order)


def test_create_column():
    data = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    cap_ann_data_df = CapAnnDataDF.from_df(data, column_order=["A", "B", "D"])
    cap_ann_data_df["C"] = [7, 8, 9]

    assert 'C' in cap_ann_data_df.columns
    assert 'C' in cap_ann_data_df.column_order


def test_rename_column():
    data = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    cap_ann_data_df = CapAnnDataDF.from_df(data)
    cap_ann_data_df.rename_column('A', 'A_renamed')

    assert 'A_renamed' in cap_ann_data_df.columns
    assert 'A' not in cap_ann_data_df.columns
    assert 'A_renamed' in cap_ann_data_df.column_order


def test_remove_column():
    data = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    cap_ann_data_df = CapAnnDataDF.from_df(data)
    cap_ann_data_df.remove_column('B')

    assert 'B' not in cap_ann_data_df.columns
    assert 'B' not in cap_ann_data_df.column_order


def test_from_df_class_method():
    data = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    new_df = CapAnnDataDF.from_df(data, ['B', 'A'])

    assert list(new_df.column_order) == ['B', 'A']


def test_column_order_integrity():
    data = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    cap_ann_data_df = CapAnnDataDF.from_df(data)
    cap_ann_data_df["C"] = [7, 8, 9]
    cap_ann_data_df.rename_column('A', 'A_renamed')
    cap_ann_data_df.remove_column('B')

    expected_order = ['A_renamed', 'C']
    assert list(cap_ann_data_df.column_order) == expected_order

cap_anndata-0.1.0/test/test_backed_uns.py
@@ -0,0 +1,36 @@
from cap_anndata import CapAnnDataUns


def test_basic_init():
    d = {'key1': 'value1', 'key2': {'value2': 'sub_value2'}}
    cap_d = CapAnnDataUns(d)
    assert cap_d == d
    assert cap_d.keys_to_remove == []
    assert cap_d.get("key1") == "value1"
    assert cap_d.get("key_N") == None


def test_pop():
    cap_d = CapAnnDataUns()
    cap_d["key1"] = "value1"
    cap_d["key2"] = "value2"

    cap_d.pop("key1")

    assert len(cap_d.keys()) == 1
    assert cap_d.keys_to_remove == ["key1"]

    cap_d["key1"] = "new_value"

    assert len(cap_d.keys()) == 2
    assert cap_d.keys_to_remove == []

    cap_d.popitem()
    assert len(cap_d.keys()) == 1
    assert cap_d.keys_to_remove == ["key1"]

    cap_d["key1"] = {'sk1': 'v1', "sk2": 'v2'}
    cap_d["key1"].pop('sk2')

    assert len(cap_d.keys()) == 2
    assert cap_d.keys_to_remove == []

cap_anndata-0.1.0/test/test_cap_anndata.py
@@ -0,0 +1,291 @@
from cap_anndata import CapAnnData
import anndata as ad
import numpy as np
import tempfile
import os
import h5py
import pandas as pd
import scipy.sparse as sp
import pytest


def get_base_anndata(n_rows: int = 10, n_genes: int = 10, sparse=False) -> ad.AnnData:
    x = np.eye(n_rows, n_genes).astype(np.float32)
    if sparse:
        x = sp.csr_matrix(x, dtype=np.float32)
    adata = ad.AnnData(X=x)
    return adata


def get_filled_anndata(n_rows: int = 10, n_genes: int = 10, sparse=False) -> ad.AnnData:
    adata = get_base_anndata(n_rows, n_genes, sparse)

    adata.obs["cell_type"] = [f"cell_{i%3}" for i in range(adata.shape[0])]
    adata.obs["number"] = [i / 10 for i in range(adata.shape[0])]
    adata.obs.index = [f"obs_{i}" for i in range(adata.shape[0])]

    adata.var.index = [f"gene_{i}" for i in range(adata.shape[1])]
    adata.var["filtered"] = [i > 4 for i in range(adata.shape[1])]
    adata.var["gene_names"] = [f"gene_name_{i}" for i in range(adata.shape[1])]
    adata.var["dispersion"] = [i / 100 for i in range(adata.shape[1])]

    adata.raw = adata
    return adata


def test_read_anndata_file():
    adata = get_base_anndata()
    temp_folder = tempfile.mkdtemp()
    file_path = os.path.join(temp_folder, "test_read_anndata_file.h5ad")
    adata.write_h5ad(file_path)
    del adata

    with CapAnnData.read_anndata_file(file_path=file_path) as adata:
        assert adata is not None, "AnnData file must be valid!"

    os.remove(file_path)


def test_read_shape():
    n_rows = 10
    n_genes = 20
    adata = get_base_anndata(n_rows, n_genes)
    temp_folder = tempfile.mkdtemp()
    file_path = os.path.join(temp_folder, "test_read_shape.h5ad")
    adata.write_h5ad(file_path)

    with h5py.File(file_path) as file:
        cap_adata = CapAnnData(file)
        shape = cap_adata.shape

    os.remove(file_path)
    assert shape[0] == n_rows
    assert shape[1] == n_genes


def test_read_df():
    adata = get_filled_anndata()
    temp_folder = tempfile.mkdtemp()
    file_path = os.path.join(temp_folder, "test_read_obs.h5ad")

    adata.write_h5ad(file_path)

    with h5py.File(file_path, 'r') as file:
        cap_adata = CapAnnData(file)
        cap_adata.read_obs()
        cap_adata.read_var()
        cap_adata.read_var(raw=True)

    os.remove(file_path)
    pd.testing.assert_frame_equal(adata.obs, cap_adata.obs, check_frame_type=False)
    pd.testing.assert_frame_equal(adata.var, cap_adata.var, check_frame_type=False)
    pd.testing.assert_frame_equal(adata.raw.var, cap_adata.raw.var, check_frame_type=False)


def test_partial_read():
    adata = get_filled_anndata()
    temp_folder = tempfile.mkdtemp()
    file_path = os.path.join(temp_folder, "test_partial_read.h5ad")
    adata.write_h5ad(file_path)

    with h5py.File(file_path, 'r') as file:
        cap_adata = CapAnnData(file)
        cap_adata.read_obs(columns=['cell_type'])
        cap_adata.read_obs(columns=['cell_type'])
        cap_adata.read_var(columns=['dispersion'])
        cap_adata.read_var(columns=['dispersion'], raw=True)

    os.remove(file_path)

    assert len(adata.obs.columns) == len(cap_adata.obs.column_order)
    assert len(adata.var.columns) == len(cap_adata.var.column_order)
    assert len(adata.raw.var.columns) == len(cap_adata.raw.var.column_order)

    assert len(cap_adata.obs.columns) == 1
    assert len(cap_adata.var.columns) == 1
    assert len(cap_adata.raw.var.columns) == 1

    pd.testing.assert_index_equal(adata.obs.index, cap_adata.obs.index)
    pd.testing.assert_index_equal(adata.var.index, cap_adata.var.index)
    pd.testing.assert_index_equal(adata.raw.var.index, cap_adata.raw.var.index)


def test_overwrite_df():
    adata = get_filled_anndata()
    temp_folder = tempfile.mkdtemp()
    file_path = os.path.join(temp_folder, "test_overwrite_df.h5ad")
    adata.write_h5ad(file_path)

    with h5py.File(file_path, 'r+') as file:
        cap_adata = CapAnnData(file)
        cap_adata.read_obs(columns=["cell_type"])
        cap_adata.obs["cell_type"] = [f"new_cell_type_{i%2}" for i in range(cap_adata.shape[0])]
        cap_adata.obs["const_str"] = "some string"
        ref_obs = cap_adata.obs.copy()

        # Modify 'var'
        cap_adata.read_var()
        cap_adata.var["gene_names"] = [f"new_gene_{i}" for i in range(cap_adata.shape[1])]
        cap_adata.var["extra_info"] = np.random.rand(cap_adata.shape[1])
        ref_var = cap_adata.var.copy()

        # Modify 'raw.var', assuming 'raw' is also a CapAnnData
        cap_adata.read_var(raw=True)
        cap_adata.raw.var["gene_names"] = [f"raw_new_gene_{i}" for i in range(cap_adata.raw.shape[1])]
        cap_adata.raw.var["extra_info"] = np.random.rand(cap_adata.shape[1])
        ref_raw_var = cap_adata.raw.var.copy()

        cap_adata.overwrite(['obs', 'var', 'raw.var'])

    adata = ad.read_h5ad(file_path)
    os.remove(file_path)

    # Assert changes in 'obs'
    assert all([c in adata.obs.columns for c in ref_obs.columns])
    pd.testing.assert_frame_equal(ref_obs, adata.obs[ref_obs.columns.to_list()], check_frame_type=False)

    # Assert changes in 'var'
    assert all([c in adata.var.columns for c in ref_var.columns])
    pd.testing.assert_frame_equal(ref_var, adata.var[ref_var.columns.to_list()], check_frame_type=False)

    # Assert changes in 'raw.var'
    assert all([c in adata.raw.var.columns for c in ref_raw_var.columns])
    pd.testing.assert_frame_equal(ref_raw_var, adata.raw.var[ref_raw_var.columns.to_list()], check_frame_type=False)


@pytest.mark.parametrize("sparse", [False, True])
@pytest.mark.parametrize("vertical_slice", [None, False, True, "mask"])
def test_link_x(sparse, vertical_slice):
    adata = get_filled_anndata(sparse=sparse)
    temp_folder = tempfile.mkdtemp()
    file_path = os.path.join(temp_folder, "test_link_x.h5ad")
    adata.write_h5ad(file_path)

    if vertical_slice is None:
        s_ = np.s_[:]
    elif vertical_slice == "mask":
        mask = np.array([i < 5 for i in range(adata.shape[0])])
        s_ = np.s_[mask, :5]
    else:
        # slice over var or obs
        s_ = np.s_[:, 0:5] if vertical_slice else np.s_[0:5, :]

    with h5py.File(file_path, 'r') as file:
        cap_adata = CapAnnData(file)
        x = cap_adata.X[s_]
        raw_x = cap_adata.raw.X[s_]

    os.remove(file_path)
    if sparse:
        assert np.allclose(adata.X.A[s_], x.A)
        assert np.allclose(adata.raw.X.A[s_], raw_x.A)
    else:
        assert np.allclose(adata.X[s_], x)
        assert np.allclose(adata.raw.X[s_], raw_x)


@pytest.mark.parametrize("sparse", [False, True])
def test_shape(sparse):
    n_rows = 15
    n_genes = 25

    adata = get_filled_anndata(n_rows, n_genes, sparse)
    temp_folder = tempfile.mkdtemp()
    file_path = os.path.join(temp_folder, "test_shape.h5ad")
    adata.write_h5ad(file_path)

    with h5py.File(file_path) as file:
        cap_adata = CapAnnData(file)
        shape = cap_adata.shape
        shape_raw = cap_adata.raw.shape

    os.remove(file_path)
    for sh in [shape, shape_raw]:
        assert sh == (n_rows, n_genes)


def test_read_obsm():
    adata = get_filled_anndata()
    obsm_names = [f"X_test{i}" for i in range(2)]

    for emb in obsm_names:
        adata.obsm[emb] = np.random.random(size=(adata.shape[0], 2))

    temp_folder = tempfile.mkdtemp()
    file_path = os.path.join(temp_folder, "test_read_obsm.h5ad")
    adata.write_h5ad(file_path)

    with h5py.File(file_path, 'r') as f:
        cap_adata = CapAnnData(f)

        ss = []
        for emb in obsm_names:
            assert emb in cap_adata.obsm_keys()
            assert cap_adata.obsm[emb].shape == adata.obsm[emb].shape

        x_1 = cap_adata.obsm[obsm_names[0]][:]
        x_2 = cap_adata.obsm[obsm_names[1]][:]

    os.remove(file_path)
    assert np.allclose(adata.obsm[obsm_names[0]], x_1)
    assert np.allclose(adata.obsm[obsm_names[1]], x_2)


def test_read_uns():
    adata = get_base_anndata()
    key1, key2 = "key1", "key2"
    keys = (key1, key2)

    adata.uns = {k: {k: k} for k in keys}
    temp_folder = tempfile.mkdtemp()
    file_path = os.path.join(temp_folder, "test_read_uns.h5ad")
    adata.write_h5ad(file_path)

    with h5py.File(file_path, 'r') as f:
        cap_adata = CapAnnData(f)

        for k in keys:
            assert k in cap_adata.uns

        cap_adata.read_uns(keys=[key1])

        assert cap_adata.uns[key1] == adata.uns[key1]  # connected
        assert cap_adata.uns[key2] != adata.uns[key2]  # not connected

    os.remove(file_path)


def test_modify_uns():
    adata = get_base_anndata()
    adata.uns = {
        "field_to_ingore": list(range(100)),
        "field_to_rename": "value",
        "field_to_expand": {"key1": {}},
        "field_to_modify": {"a": "b"}
    }
    new_name = "renamed_field"
    d_to_exp = {"sub_key1": "v1", "sub_key2": "v2"}
    v_to_mod = "value"

    temp_folder = tempfile.mkdtemp()
    file_path = os.path.join(temp_folder, "test_modify_uns.h5ad")
    adata.write_h5ad(file_path)

    with h5py.File(file_path, 'r+') as f:
        cap_adata = CapAnnData(f)

        cap_adata.read_uns(keys=["field_to_rename", "field_to_expand", "field_to_modify"])

        cap_adata.uns[new_name] = cap_adata.uns.pop("field_to_rename")
        cap_adata.uns["field_to_expand"]["key1"] = d_to_exp
        cap_adata.uns["field_to_modify"] = v_to_mod

        cap_adata.overwrite(['uns'])

    adata = ad.read_h5ad(file_path)

    assert adata.uns is not None
    assert len(adata.uns.keys()) == 4
    assert new_name in adata.uns.keys()
    assert adata.uns['field_to_expand']["key1"] == d_to_exp
    assert adata.uns['field_to_modify'] == v_to_mod